This section installs and loads the required packages for data manipulation, visualization, and modelling.
# install.packages(c("readxl", "dplyr", "ggplot2", "lubridate", "caret", "randomForest", "xgboost", "corrplot", "stringr", "skimr", "tidyr"))
library(readxl)
library(dplyr)
library(ggplot2)
library(lubridate)
library(caret)
library(randomForest)
library(xgboost)
library(corrplot)
library(stringr)
library(skimr)
library(tidyr)
library(scales)
library(broom)
library(gridExtra)
library(factoextra)
The dataset is imported from an Excel file and briefly inspected to understand its structure.
data <- read_excel("Advanced Topics Individual Dataset.xlsx")
knitr::kable(head(data))
| Customer ID | Age | Gender | Loyalty Member | Product Type | SKU | Rating | Order Status | Payment Method | Total Price | Unit Price | Quantity | Purchase Date | Shipping Type | Add-ons Purchased | Add-on Total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1000 | 53 | Male | No | Smartphone | SKU1004 | 2 | Cancelled | Credit Card | 5538.33 | 791.19 | 7 | 2024-03-20 | Standard | Accessory,Accessory,Accessory | 40.21 |
| 1000 | 53 | Male | No | Tablet | SKU1002 | 3 | Completed | Paypal | 741.09 | 247.03 | 3 | 2024-04-20 | Overnight | Impulse Item | 26.09 |
| 1002 | 41 | Male | No | Laptop | SKU1005 | 3 | Completed | Credit Card | 1855.84 | 463.96 | 4 | 2023-10-17 | Express | NA | 0.00 |
| 1002 | 41 | Male | Yes | Smartphone | SKU1004 | 2 | Completed | Cash | 3164.76 | 791.19 | 4 | 2024-08-09 | Overnight | Impulse Item,Impulse Item | 60.16 |
| 1003 | 75 | Male | Yes | Smartphone | SKU1001 | 5 | Completed | Cash | 41.50 | 20.75 | 2 | 2024-05-21 | Express | Accessory | 35.56 |
| 1004 | 41 | Female | No | Smartphone | SKU1001 | 5 | Completed | Credit Card | 83.00 | 20.75 | 4 | 2024-05-26 | Standard | Impulse Item,Accessory | 65.78 |
Column names are cleaned to ensure consistency and avoid issues in later analysis.
colnames(data) <- colnames(data) %>%
str_replace_all(" ", "_") %>%
str_replace_all("[^[:alnum:]_]", "")
knitr::kable(colnames(data))
| x |
|---|
| Customer_ID |
| Age |
| Gender |
| Loyalty_Member |
| Product_Type |
| SKU |
| Rating |
| Order_Status |
| Payment_Method |
| Total_Price |
| Unit_Price |
| Quantity |
| Purchase_Date |
| Shipping_Type |
| Addons_Purchased |
| Addon_Total |
Missing values are checked across all variables.
na_summary <- sapply(data, function(x) sum(is.na(x)))
knitr::kable(na_summary)
| x | |
|---|---|
| Customer_ID | 0 |
| Age | 0 |
| Gender | 1 |
| Loyalty_Member | 0 |
| Product_Type | 0 |
| SKU | 0 |
| Rating | 0 |
| Order_Status | 0 |
| Payment_Method | 0 |
| Total_Price | 0 |
| Unit_Price | 0 |
| Quantity | 0 |
| Purchase_Date | 0 |
| Shipping_Type | 0 |
| Addons_Purchased | 4868 |
| Addon_Total | 0 |
Missing values in add-on related variables are handled.
data$Addons_Purchased[is.na(data$Addons_Purchased)] <- "None"
data$Addon_Total[is.na(data$Addon_Total)] <- 0
Categorical values are standardized for consistency.
data$Payment_Method <- str_to_title(data$Payment_Method)
data$Payment_Method[data$Payment_Method == "Paypal"] <- "PayPal"
Variables are converted into appropriate formats for analysis.
data$Purchase_Date <- as.Date(data$Purchase_Date)
data$Customer_ID <- as.character(data$Customer_ID)
data$Gender <- as.factor(data$Gender)
data$Loyalty_Member <- as.factor(data$Loyalty_Member)
data$Product_Type <- as.factor(data$Product_Type)
data$Payment_Method <- as.factor(data$Payment_Method)
data$Shipping_Type <- as.factor(data$Shipping_Type)
data$Order_Status <- as.factor(data$Order_Status)
str(data)
## tibble [20,000 × 16] (S3: tbl_df/tbl/data.frame)
## $ Customer_ID : chr [1:20000] "1000" "1000" "1002" "1002" ...
## $ Age : num [1:20000] 53 53 41 41 75 41 25 25 24 24 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 1 1 1 2 2 ...
## $ Loyalty_Member : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 1 1 1 2 ...
## $ Product_Type : Factor w/ 5 levels "Headphones","Laptop",..: 3 5 2 3 3 3 4 2 3 2 ...
## $ SKU : chr [1:20000] "SKU1004" "SKU1002" "SKU1005" "SKU1004" ...
## $ Rating : num [1:20000] 2 3 3 2 5 5 3 3 2 3 ...
## $ Order_Status : Factor w/ 2 levels "Cancelled","Completed": 1 2 2 2 2 2 2 2 1 2 ...
## $ Payment_Method : Factor w/ 5 levels "Bank Transfer",..: 3 5 3 2 2 3 5 4 4 2 ...
## $ Total_Price : num [1:20000] 5538.3 741.1 1855.8 3164.8 41.5 ...
## $ Unit_Price : num [1:20000] 791.2 247 464 791.2 20.8 ...
## $ Quantity : num [1:20000] 7 3 4 4 2 4 9 9 7 9 ...
## $ Purchase_Date : Date[1:20000], format: "2024-03-20" "2024-04-20" ...
## $ Shipping_Type : Factor w/ 5 levels "Expedited","Express",..: 5 3 2 3 2 5 3 3 5 2 ...
## $ Addons_Purchased: chr [1:20000] "Accessory,Accessory,Accessory" "Impulse Item" "None" "Impulse Item,Impulse Item" ...
## $ Addon_Total : num [1:20000] 40.2 26.1 0 60.2 35.6 ...
A summary of the cleaned dataset is displayed.
data_full <- data
data_clean <- data %>%
filter(Order_Status == "Completed")
skim(data_clean)
| Name | data_clean |
| Number of rows | 13432 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| Date | 1 |
| factor | 6 |
| numeric | 6 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Customer_ID | 0 | 1 | 4 | 5 | 0 | 9466 | 0 |
| SKU | 0 | 1 | 6 | 7 | 0 | 10 | 0 |
| Addons_Purchased | 0 | 1 | 4 | 55 | 0 | 76 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| Purchase_Date | 0 | 1 | 2023-09-24 | 2024-09-23 | 2024-04-24 | 366 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Gender | 1 | 1 | FALSE | 2 | Mal: 6780, Fem: 6651 |
| Loyalty_Member | 0 | 1 | FALSE | 2 | No: 10512, Yes: 2920 |
| Product_Type | 0 | 1 | FALSE | 5 | Sma: 4004, Tab: 2745, Lap: 2686, Sma: 2636 |
| Order_Status | 0 | 1 | FALSE | 1 | Com: 13432, Can: 0 |
| Payment_Method | 0 | 1 | FALSE | 5 | Cre: 3899, Pay: 3863, Ban: 2259, Cas: 1727 |
| Shipping_Type | 0 | 1 | FALSE | 5 | Sta: 4561, Ove: 2247, Exp: 2227, Exp: 2210 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Age | 0 | 1 | 48.97 | 18.06 | 18.00 | 33.00 | 49.00 | 65.00 | 80.00 | ▇▇▇▇▇ |
| Rating | 0 | 1 | 3.10 | 1.23 | 1.00 | 2.00 | 3.00 | 4.00 | 5.00 | ▂▃▇▂▅ |
| Total_Price | 0 | 1 | 3173.74 | 2541.48 | 20.75 | 1083.54 | 2534.49 | 4639.60 | 11396.80 | ▇▅▃▂▁ |
| Unit_Price | 0 | 1 | 577.88 | 312.83 | 20.75 | 361.18 | 463.96 | 791.19 | 1139.68 | ▂▇▂▆▂ |
| Quantity | 0 | 1 | 5.48 | 2.87 | 1.00 | 3.00 | 5.00 | 8.00 | 10.00 | ▇▇▇▇▇ |
| Addon_Total | 0 | 1 | 62.21 | 57.96 | 0.00 | 7.19 | 52.11 | 93.56 | 292.77 | ▇▅▂▁▁ |
New variables are created to capture additional transaction characteristics.
# Create Transaction ID
data_clean <- data_clean %>%
mutate(Transaction_ID = row_number())
# Extract Month from Purchase Date
data_clean$Month <- month(data_clean$Purchase_Date)
# Installment Payment (1 = Credit Card / PayPal, else 0)
data_clean$Installment_Payment <- ifelse(
data_clean$Payment_Method %in% c("Credit Card", "PayPal"), 1, 0
)
# Handling Multi-Value Categorical Variable (Add-ons Purchased)
data_clean <- data_clean %>%
mutate(Has_Addon = ifelse(
is.na(Addons_Purchased) |
Addons_Purchased == "" |
tolower(trimws(Addons_Purchased)) == "none",
0,
str_count(Addons_Purchased, ",") + 1
))
addons_df <- data_clean %>%
separate_rows(Addons_Purchased, sep = ",") %>%
mutate(Addons_Purchased = str_trim(Addons_Purchased)) %>%
filter(
!is.na(Addons_Purchased),
Addons_Purchased != "",
tolower(Addons_Purchased) != "none"
) %>%
group_by(Transaction_ID, Addons_Purchased) %>%
summarise(count = n(), .groups = "drop") %>%
pivot_wider(
names_from = Addons_Purchased,
values_from = count,
values_fill = 0
)
addon_cols <- setdiff(colnames(addons_df), "Transaction_ID")
data_clean <- data_clean %>%
left_join(addons_df, by = "Transaction_ID") %>%
mutate(across(all_of(addon_cols), ~replace_na(., 0)))
data_clean <- data_clean %>%
select(
Transaction_ID,
Customer_ID,
Age,
Gender,
Loyalty_Member,
Purchase_Date,
Month,
Product_Type,
SKU,
Order_Status,
Shipping_Type,
Payment_Method,
Installment_Payment,
Quantity,
Unit_Price,
Total_Price,
Addon_Total,
Rating,
Has_Addon,
`Impulse Item`,
Accessory,
`Extended Warranty`
)
# Check new variables
knitr::kable(head(data_clean))
| Transaction_ID | Customer_ID | Age | Gender | Loyalty_Member | Purchase_Date | Month | Product_Type | SKU | Order_Status | Shipping_Type | Payment_Method | Installment_Payment | Quantity | Unit_Price | Total_Price | Addon_Total | Rating | Has_Addon | Impulse Item | Accessory | Extended Warranty |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1000 | 53 | Male | No | 2024-04-20 | 4 | Tablet | SKU1002 | Completed | Overnight | PayPal | 1 | 3 | 247.03 | 741.09 | 26.09 | 3 | 1 | 1 | 0 | 0 |
| 2 | 1002 | 41 | Male | No | 2023-10-17 | 10 | Laptop | SKU1005 | Completed | Express | Credit Card | 1 | 4 | 463.96 | 1855.84 | 0.00 | 3 | 0 | 0 | 0 | 0 |
| 3 | 1002 | 41 | Male | Yes | 2024-08-09 | 8 | Smartphone | SKU1004 | Completed | Overnight | Cash | 0 | 4 | 791.19 | 3164.76 | 60.16 | 2 | 2 | 2 | 0 | 0 |
| 4 | 1003 | 75 | Male | Yes | 2024-05-21 | 5 | Smartphone | SKU1001 | Completed | Express | Cash | 0 | 2 | 20.75 | 41.50 | 35.56 | 5 | 1 | 0 | 1 | 0 |
| 5 | 1004 | 41 | Female | No | 2024-05-26 | 5 | Smartphone | SKU1001 | Completed | Standard | Credit Card | 1 | 4 | 20.75 | 83.00 | 65.78 | 5 | 2 | 1 | 1 | 0 |
| 6 | 1005 | 25 | Female | No | 2024-01-30 | 1 | Smartwatch | SKU1003 | Completed | Overnight | PayPal | 1 | 9 | 844.83 | 7603.47 | 0.00 | 3 | 0 | 0 | 0 | 0 |
Transaction data is aggregated to the customer level to capture purchasing behaviour.
customer_data <- data_clean %>%
group_by(Customer_ID) %>%
summarise(
Age = first(Age),
Gender = first(Gender),
Ever_Loyal = max(ifelse(Loyalty_Member == "Yes", 1, 0)),
Total_Transactions = n(),
Total_Spend = sum(Total_Price, na.rm = TRUE),
Avg_Spend = mean(Total_Price, na.rm = TRUE),
Total_Quantity = sum(Quantity, na.rm = TRUE),
Has_Addon = sum(Has_Addon, na.rm = TRUE),
`Impulse Item` = sum(`Impulse Item`, na.rm = TRUE),
Accessory = sum(Accessory, na.rm = TRUE),
`Extended Warranty` = sum(`Extended Warranty`, na.rm = TRUE),
.groups = "drop"
)
# View dataset
knitr::kable(head(customer_data))
| Customer_ID | Age | Gender | Ever_Loyal | Total_Transactions | Total_Spend | Avg_Spend | Total_Quantity | Has_Addon | Impulse Item | Accessory | Extended Warranty |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1000 | 53 | Male | 0 | 1 | 741.09 | 741.090 | 3 | 1 | 1 | 0 | 0 |
| 1002 | 41 | Male | 1 | 2 | 5020.60 | 2510.300 | 8 | 2 | 2 | 0 | 0 |
| 1003 | 75 | Male | 1 | 1 | 41.50 | 41.500 | 2 | 1 | 0 | 1 | 0 |
| 1004 | 41 | Female | 0 | 1 | 83.00 | 83.000 | 4 | 2 | 1 | 1 | 0 |
| 1005 | 25 | Female | 0 | 2 | 11779.11 | 5889.555 | 18 | 2 | 0 | 0 | 2 |
| 1006 | 24 | Male | 1 | 2 | 6645.94 | 3322.970 | 19 | 3 | 2 | 1 | 0 |
The distribution of variables is summarised to identify patterns and variability.
eda_table <- function(df, var) {
df %>%
group_by({{ var }}) %>%
summarise(
Count = n(),
Percentage = n() / nrow(df),
Avg_Value = mean(Total_Price, na.rm = TRUE),
Total_Value = sum(Total_Price, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(Count))
}
knitr::kable(head(eda_table(data_clean, Age)))
| Age | Count | Percentage | Avg_Value | Total_Value |
|---|---|---|---|---|
| 61 | 259 | 0.0192823 | 3183.235 | 824458.0 |
| 40 | 253 | 0.0188356 | 2903.359 | 734549.8 |
| 28 | 246 | 0.0183145 | 2996.149 | 737052.8 |
| 67 | 244 | 0.0181656 | 3006.334 | 733545.5 |
| 50 | 238 | 0.0177189 | 3253.173 | 774255.1 |
| 37 | 236 | 0.0175700 | 3043.013 | 718151.0 |
knitr::kable(eda_table(data_clean, Gender))
| Gender | Count | Percentage | Avg_Value | Total_Value |
|---|---|---|---|---|
| Male | 6780 | 0.5047647 | 3152.216 | 21372025.22 |
| Female | 6651 | 0.4951608 | 3196.048 | 21256916.03 |
| NA | 1 | 0.0000744 | 674.320 | 674.32 |
knitr::kable(eda_table(data_clean, Month))
| Month | Count | Percentage | Avg_Value | Total_Value |
|---|---|---|---|---|
| 1 | 1399 | 0.1041543 | 3228.219 | 4516278 |
| 5 | 1371 | 0.1020697 | 3255.226 | 4462915 |
| 7 | 1356 | 0.1009529 | 3289.784 | 4460946 |
| 6 | 1349 | 0.1004318 | 3310.597 | 4465995 |
| 3 | 1323 | 0.0984961 | 3194.351 | 4226127 |
| 8 | 1323 | 0.0984961 | 3309.473 | 4378433 |
| 4 | 1284 | 0.0955926 | 3349.162 | 4300324 |
| 2 | 1227 | 0.0913490 | 3176.243 | 3897250 |
| 9 | 1125 | 0.0837552 | 3258.562 | 3665883 |
| 10 | 613 | 0.0456373 | 2537.236 | 1555326 |
| 11 | 537 | 0.0399792 | 2588.671 | 1390116 |
| 12 | 525 | 0.0390858 | 2495.282 | 1310023 |
knitr::kable(eda_table(data_clean, Product_Type))
| Product_Type | Count | Percentage | Avg_Value | Total_Value |
|---|---|---|---|---|
| Smartphone | 4004 | 0.2980941 | 3598.361 | 14407836 |
| Tablet | 2745 | 0.2043627 | 2813.345 | 7722632 |
| Laptop | 2686 | 0.1999702 | 3114.633 | 8365905 |
| Smartwatch | 2636 | 0.1962478 | 3565.475 | 9398591 |
| Headphones | 1361 | 0.1013252 | 2009.295 | 2734651 |
knitr::kable(eda_table(data_clean, SKU))
| SKU | Count | Percentage | Avg_Value | Total_Value |
|---|---|---|---|---|
| SKU1002 | 1382 | 0.1028886 | 1360.6312 | 1880392.4 |
| TBL345 | 1363 | 0.1014741 | 4286.3095 | 5842239.9 |
| HDP456 | 1360 | 0.1012507 | 2007.7359 | 2730520.8 |
| SKU1005 | 1352 | 0.1006552 | 2442.1142 | 3301738.4 |
| SMP234 | 1346 | 0.1002085 | 6254.6925 | 8418816.2 |
| LTP123 | 1334 | 0.0993151 | 3796.7148 | 5064817.5 |
| SKU1001 | 1329 | 0.0989428 | 112.0875 | 148964.2 |
| SKU1004 | 1329 | 0.0989428 | 4393.8335 | 5839404.7 |
| SWT567 | 1325 | 0.0986450 | 2546.4970 | 3374108.5 |
| SKU1003 | 1312 | 0.0976772 | 4594.9794 | 6028612.9 |
knitr::kable(eda_table(data_clean, Shipping_Type))
| Shipping_Type | Count | Percentage | Avg_Value | Total_Value |
|---|---|---|---|---|
| Standard | 4561 | 0.3395622 | 3154.360 | 14387038 |
| Overnight | 2247 | 0.1672871 | 2620.259 | 5887723 |
| Express | 2227 | 0.1657981 | 2529.005 | 5632093 |
| Expedited | 2210 | 0.1645325 | 3814.654 | 8430386 |
| Same Day | 2187 | 0.1628201 | 3791.667 | 8292376 |
knitr::kable(head(eda_table(data_clean, Quantity)))
| Quantity | Count | Percentage | Avg_Value | Total_Value |
|---|---|---|---|---|
| 2 | 1369 | 0.1019208 | 1150.322 | 1574791 |
| 3 | 1367 | 0.1017719 | 1719.858 | 2351047 |
| 8 | 1363 | 0.1014741 | 4631.195 | 6312319 |
| 9 | 1344 | 0.1000596 | 5236.902 | 7038397 |
| 4 | 1343 | 0.0999851 | 2308.096 | 3099773 |
| 6 | 1343 | 0.0999851 | 3517.713 | 4724289 |
knitr::kable(eda_table(data_clean, Rating))
| Rating | Count | Percentage | Avg_Value | Total_Value |
|---|---|---|---|---|
| 3 | 5339 | 0.3974836 | 3034.852 | 16203077 |
| 5 | 2677 | 0.1993002 | 1981.182 | 5303625 |
| 2 | 2642 | 0.1966945 | 4077.199 | 10771960 |
| 1 | 1392 | 0.1036331 | 3751.574 | 5222191 |
| 4 | 1382 | 0.1028886 | 3711.116 | 5128763 |
knitr::kable(eda_table(data_clean, Has_Addon))
| Has_Addon | Count | Percentage | Avg_Value | Total_Value |
|---|---|---|---|---|
| 2 | 3419 | 0.2545414 | 3110.484 | 10634744 |
| 3 | 3375 | 0.2512656 | 3185.780 | 10752008 |
| 1 | 3351 | 0.2494789 | 3201.507 | 10728250 |
| 0 | 3287 | 0.2447141 | 3198.848 | 10514615 |
The distribution of variables is visualized to identify patterns and variability.
# Function to create clean bar chart
plot_bar <- function(df, var, title_name) {
df %>%
filter(!is.na({{ var }})) %>%
ggplot(aes(x = as.factor({{ var }}))) +
geom_bar(fill = "navyblue") +
labs(
title = title_name,
x = NULL,
y = NULL
) +
theme_minimal() +
theme(
plot.title = element_text(size = 12, face = "bold"),
axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
axis.text.y = element_text(size = 8)
)
}
# Create plots
p1 <- ggplot(data_clean, aes(x = Age)) +
geom_histogram(fill = "navyblue", bins = 20, color = "white", linedwidth = 0.2) +
scale_x_continuous(breaks = seq(0, 100, by = 10), limits = c(15, 85)) +
labs(title = "Age", x = NULL, y = NULL) +
theme_minimal() +
theme(
plot.title = element_text(size = 12, face = "bold"),
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8)
)
p2 <- plot_bar(data_clean, Gender, "Gender")
p3 <- plot_bar(data_clean, Month, "Month")
p4 <- plot_bar(data_clean, Product_Type, "Product Type")
p5 <- plot_bar(data_clean, SKU, "SKU")
p6 <- plot_bar(data_clean, Shipping_Type, "Shipping Type")
p7 <- plot_bar(data_clean, Quantity, "Quantity")
p8 <- plot_bar(data_clean, Rating, "Rating")
p9 <- plot_bar(data_clean, Has_Addon, "Has Addon")
# Arrange in 3x3 grid
grid.arrange(p1, p2, p3,
p4, p5, p6,
p7, p8, p9,
ncol = 3)
Displays the relationship between product types and SKUs
sku_table <- data_clean %>%
select(Product_Type, SKU) %>%
distinct() %>%
arrange(Product_Type, SKU)
knitr::kable(sku_table)
| Product_Type | SKU |
|---|---|
| Headphones | HDP456 |
| Headphones | SKU1003 |
| Laptop | LTP123 |
| Laptop | SKU1004 |
| Laptop | SKU1005 |
| Smartphone | SKU1001 |
| Smartphone | SKU1004 |
| Smartphone | SKU1005 |
| Smartphone | SMP234 |
| Smartwatch | SKU1003 |
| Smartwatch | SWT567 |
| Tablet | SKU1002 |
| Tablet | TBL345 |
Shows how spending changes across months, highlighting overall fluctuations over time
# Prepare monthly data
monthly_df <- data_clean %>%
group_by(Month) %>%
summarise(
Total_Spend = sum(Total_Price, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(Month)
# Convert Month (numeric) to Jan–Dec
monthly_df$Month <- factor(monthly_df$Month,
levels = 1:12,
labels = c("Jan","Feb","Mar","Apr","May","Jun",
"Jul","Aug","Sep","Oct","Nov","Dec"))
# Plot
p <- ggplot(monthly_df, aes(x = Month, y = Total_Spend, group = 1)) +
geom_line(color = "navyblue", linewidth = 0.9) +
geom_point(color = "navyblue", size = 2.5) +
geom_smooth(method = "lm", se = FALSE,
color = "red",
linetype = "dashed",
linewidth = 0.8) +
labs(
title = "Monthly Total Spend",
x = "Month",
y = "Total Spend"
) +
scale_y_continuous(labels = comma) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10)
)
print(p)
A transaction-level dataset is prepared for modelling.
model_df <- data_clean %>%
select(
Total_Price,
Product_Type,
Has_Addon,
`Impulse Item`,
Accessory,
`Extended Warranty`,
Installment_Payment,
Payment_Method,
Loyalty_Member,
Age,
Gender,
Shipping_Type
) %>%
na.omit()
model_df <- model_df %>%
rename(
Impulse_Item = `Impulse Item`,
Extended_Warranty = `Extended Warranty`
) %>%
mutate(
across(c(Product_Type, Has_Addon, Installment_Payment,
Payment_Method, Loyalty_Member, Gender,
Shipping_Type),
as.factor)
)
A linear regression model is fitted to identify significant predictors of total price.
model_lm <- lm(Total_Price ~ ., data = model_df)
knitr::kable(tidy(model_lm))
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 2532.9483495 | 110.547949 | 22.9126671 | 0.0000000 |
| Product_TypeLaptop | 1741.8436603 | 82.088398 | 21.2191211 | 0.0000000 |
| Product_TypeSmartphone | 2407.8120918 | 79.228278 | 30.3908169 | 0.0000000 |
| Product_TypeSmartwatch | 2182.1868738 | 82.267429 | 26.5255264 | 0.0000000 |
| Product_TypeTablet | 1432.5368643 | 81.744659 | 17.5245317 | 0.0000000 |
| Has_Addon1 | -23.9619200 | 63.405396 | -0.3779161 | 0.7054989 |
| Has_Addon2 | -80.4262948 | 75.909889 | -1.0594969 | 0.2893926 |
| Has_Addon3 | -5.6159336 | 92.603866 | -0.0606447 | 0.9516431 |
| Impulse_Item | -10.1411420 | 41.208594 | -0.2460929 | 0.8056141 |
| Accessory | 30.4777352 | 41.053373 | 0.7423930 | 0.4578623 |
| Extended_Warranty | NA | NA | NA | NA |
| Installment_Payment1 | -288.8079095 | 65.834027 | -4.3869094 | 0.0000116 |
| Payment_MethodCash | -760.0698786 | 86.799571 | -8.7566087 | 0.0000000 |
| Payment_MethodCredit Card | -67.9405732 | 54.442797 | -1.2479258 | 0.2120800 |
| Payment_MethodDebit Card | -582.1749068 | 87.284872 | -6.6698259 | 0.0000000 |
| Payment_MethodPayPal | NA | NA | NA | NA |
| Loyalty_MemberYes | -50.7459147 | 50.180250 | -1.0112726 | 0.3119042 |
| Age | 0.4062262 | 1.145615 | 0.3545924 | 0.7229006 |
| GenderMale | -45.1755820 | 41.398827 | -1.0912285 | 0.2751919 |
| Shipping_TypeExpress | -1509.1345681 | 79.742130 | -18.9251850 | 0.0000000 |
| Shipping_TypeOvernight | -1400.7894690 | 79.955822 | -17.5195430 | 0.0000000 |
| Shipping_TypeSame Day | -52.6481736 | 72.337531 | -0.7278127 | 0.4667410 |
| Shipping_TypeStandard | -776.9077880 | 64.460805 | -12.0524059 | 0.0000000 |
A random forest model is fitted to identify important features driving total price.
set.seed(123)
model_rf <- randomForest(
Total_Price ~ .,
data = model_df,
importance = TRUE
)
importance_df <- as.data.frame(importance(model_rf))
importance_df$Variable <- rownames(importance_df)
ggplot(importance_df, aes(x = reorder(Variable, `%IncMSE`), y = `%IncMSE`)) +
geom_col(aes(fill = `%IncMSE` < 25)) +
scale_fill_manual(
values = c("TRUE" = "darkred", "FALSE" = "navyblue"),
guide = "none"
) +
coord_flip() +
labs(
title = "Feature Importance (Random Forest)",
subtitle = "Low-importance variables highlighted in red",
x = NULL,
y = "% Increase in MSE"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 11),
axis.text = element_text(size = 10)
)
A Customer-level dataset is prepared for modelling.
clustering_df <- customer_data %>%
select(
Age,
Ever_Loyal,
Total_Transactions,
Total_Spend,
Avg_Spend,
Total_Quantity,
Has_Addon,
`Impulse Item`,
Accessory,
`Extended Warranty`
)
clustering_scaled <- scale(clustering_df)
The elbow method is used to determine the optimal number of clusters.
fviz_nbclust(clustering_scaled, kmeans, method = "wss") +
labs(title = "Elbow Method for Optimal Clusters")
K-means clustering is performed to segment customers into groups based on their purchasing behaviour.
set.seed(123)
kmeans_model <- kmeans(clustering_scaled, centers = 3, nstart = 25)
fviz_cluster(kmeans_model, data = clustering_scaled)
clustering_df$Cluster <- as.factor(kmeans_model$cluster)
cluster_profiling <- clustering_df %>%
group_by(Cluster) %>%
summarise(
Avg_Age = mean(Age),
Avg_Loyalty = mean(Ever_Loyal),
Avg_Spend = mean(Total_Spend),
Avg_Transactions = mean(Total_Transactions),
Avg_Quantity = mean(Total_Quantity),
Avg_Addon = mean(Has_Addon)
)
knitr::kable(cluster_profiling)
| Cluster | Avg_Age | Avg_Loyalty | Avg_Spend | Avg_Transactions | Avg_Quantity | Avg_Addon |
|---|---|---|---|---|---|---|
| 1 | 48.38653 | 0.4834197 | 8218.609 | 2.435751 | 14.007772 | 4.435233 |
| 2 | 49.61802 | 0.2372881 | 7054.542 | 1.248863 | 9.212071 | 1.489872 |
| 3 | 49.26129 | 0.2282587 | 1896.182 | 1.115888 | 4.754935 | 1.592730 |
Payment method and installment payment summaries are created.
payment_summary <- data_clean %>%
mutate(
Payment_Group = ifelse(
Payment_Method %in% c("PayPal", "Credit Card"),
Payment_Method,
"Other"
)
) %>%
group_by(Payment_Method, Payment_Group) %>%
summarise(
Transaction_Count = n(),
.groups = "drop"
)
installment_summary <- data_clean %>%
mutate(Installment_Label = ifelse(Installment_Payment == 1, "Yes", "No")) %>%
group_by(Installment_Label) %>%
summarise(
Transaction_Count = n(),
.groups = "drop"
)
This plot compares the transaction counts for different payment methods.
ggplot(payment_summary, aes(x = Payment_Method, y = Transaction_Count)) +
geom_col(aes(fill = Payment_Method %in% c("PayPal", "Credit Card"))) +
scale_fill_manual(
values = c("TRUE" = "navyblue", "FALSE" = "darkred"),
guide = "none"
) +
labs(
title = "Transaction by Payment Method",
x = NULL,
y = NULL
) +
theme_minimal() +
theme(
plot.title = element_text(size = 18, face = "bold"),
axis.text = element_text(size = 13, angle = 0, hjust = 0.5)
)
This plot compares the transaction counts for customers who used installment payment versus those who did not.
ggplot(installment_summary, aes(x = Installment_Label, y = Transaction_Count)) +
geom_col(aes(fill = Installment_Label)) +
scale_fill_manual(
values = c("Yes" = "navyblue", "No" = "darkred"),
guide = "none"
) +
labs(
title = "Transaction by Instalment Payment",
x = NULL,
y = NULL
) +
theme_minimal() +
theme(
plot.title = element_text(size = 18, face = "bold"),
axis.text = element_text(size = 13, angle = 0, hjust = 0.5)
)
This table summarizes the average spend and quantity for customers based on whether they used installment payment or not
payment_behavior <- data_clean %>%
group_by(Installment_Payment) %>%
summarise(
Avg_Spend = round(mean(Total_Price), 2),
Avg_Quantity = round(mean(Quantity), 2),
.groups = "drop"
)
knitr::kable(payment_behavior)
| Installment_Payment | Avg_Spend | Avg_Quantity |
|---|---|---|
| 0 | 3066.85 | 5.50 |
| 1 | 3251.81 | 5.47 |