# Set the default CRAN mirror for the session
options(repos = c(CRAN = "https://cloud.r-project.org/"))
# install packages
install.packages("dplyr")
## package 'dplyr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\obuczki\AppData\Local\Temp\RtmpIJQyoX\downloaded_packages
install.packages("ggplot2")
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\obuczki\AppData\Local\Temp\RtmpIJQyoX\downloaded_packages
install.packages("GGally")
## package 'GGally' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\obuczki\AppData\Local\Temp\RtmpIJQyoX\downloaded_packages
install.packages("psych")
## package 'psych' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\obuczki\AppData\Local\Temp\RtmpIJQyoX\downloaded_packages
install.packages("kableExtra")
## package 'kableExtra' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\obuczki\AppData\Local\Temp\RtmpIJQyoX\downloaded_packages
# load packages in the environment
library("dplyr")
library("ggplot2")
library("GGally")
library("psych")
library("knitr")
library("kableExtra")
# Load the data from GitHub
customer_df <- read.csv("https://raw.githubusercontent.com/obuczkipp/CustomerSegmentation/main/customer_segmentation_data.csv")
# head of the data
head(customer_df)
## id age gender income spending_score membership_years purchase_frequency
## 1 1 38 Female 99342 90 3 24
## 2 2 21 Female 78852 60 2 42
## 3 3 60 Female 126573 30 2 28
## 4 4 40 Other 47099 74 9 5
## 5 5 65 Female 140621 21 3 25
## 6 6 31 Other 57305 24 3 30
## preferred_category last_purchase_amount
## 1 Groceries 113.53
## 2 Sports 41.93
## 3 Clothing 424.36
## 4 Home & Garden 991.93
## 5 Electronics 347.08
## 6 Home & Garden 86.85
str(customer_df)
## 'data.frame': 1000 obs. of 9 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 38 21 60 40 65 31 19 43 53 55 ...
## $ gender : chr "Female" "Female" "Female" "Other" ...
## $ income : int 99342 78852 126573 47099 140621 57305 54319 108115 34424 45839 ...
## $ spending_score : int 90 60 30 74 21 24 68 94 29 55 ...
## $ membership_years : int 3 2 2 9 3 3 5 9 6 7 ...
## $ purchase_frequency : int 24 42 28 5 25 30 43 27 7 2 ...
## $ preferred_category : chr "Groceries" "Sports" "Clothing" "Home & Garden" ...
## $ last_purchase_amount: num 113.5 41.9 424.4 991.9 347.1 ...
# Check missing values in the entire data frame
missing_values <- colSums(is.na(customer_df))
# Print the result
print(missing_values)
## id age gender
## 0 0 0
## income spending_score membership_years
## 0 0 0
## purchase_frequency preferred_category last_purchase_amount
## 0 0 0
# Check the summary of the data
summary(customer_df)
## id age gender income
## Min. : 1.0 Min. :18.00 Length:1000 Min. : 30004
## 1st Qu.: 250.8 1st Qu.:30.00 Class :character 1st Qu.: 57912
## Median : 500.5 Median :45.00 Mode :character Median : 87846
## Mean : 500.5 Mean :43.78 Mean : 88501
## 3rd Qu.: 750.2 3rd Qu.:57.00 3rd Qu.:116110
## Max. :1000.0 Max. :69.00 Max. :149973
## spending_score membership_years purchase_frequency preferred_category
## Min. : 1.00 Min. : 1.000 Min. : 1.0 Length:1000
## 1st Qu.: 26.00 1st Qu.: 3.000 1st Qu.:15.0 Class :character
## Median : 50.00 Median : 5.000 Median :27.0 Mode :character
## Mean : 50.69 Mean : 5.469 Mean :26.6
## 3rd Qu.: 76.00 3rd Qu.: 8.000 3rd Qu.:39.0
## Max. :100.00 Max. :10.000 Max. :50.0
## last_purchase_amount
## Min. : 10.4
## 1st Qu.:218.8
## Median :491.6
## Mean :492.3
## 3rd Qu.:747.2
## Max. :999.7
# age
boxplot(customer_df$age, main = "Age Distribution", ylab = "Age in Years", col = "turquoise")
# Our boxplot is symmetric, just for as an example we can use the IQR method to detect outliers in the age variable.
# Calculate Q1 (25th percentile) and Q3 (75th percentile) for the age variable
Q1 <- quantile(customer_df$age, 0.25)
Q3 <- quantile(customer_df$age, 0.75)
# Calculate the IQR (Interquartile Range)
IQR_value <- Q3 - Q1
# Determine the lower and upper bounds for outliers
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Identify outliers in the age variable
outliers <- customer_df$age[customer_df$age < lower_bound | customer_df$age > upper_bound]
# Print the outliers
print(outliers)
## integer(0)
boxplot(customer_df$income, main = "Income Distribution", ylab = "Income in USD", col = "turquoise")
# Calculate Q1 (25th percentile) and Q3 (75th percentile) for the Income variable
Q1 <- quantile(customer_df$income, 0.25)
Q3 <- quantile(customer_df$income, 0.75)
# Calculate the IQR (Interquartile Range)
IQR_value <- Q3 - Q1
# Determine the lower and upper bounds for outliers
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Identify outliers in the age variable
outliers_2 <- customer_df$income[customer_df$income < lower_bound | customer_df$income > upper_bound]
# Print the outliers
print(outliers_2)
## integer(0)
# spending score
boxplot(customer_df$spending_score, main = "Spending Score Distribution", ylab = "Spending Score", col = "turquoise")
# box plot purchase frequency
boxplot(customer_df$purchase_frequency, main = "Purchase Frequency", ylab = "Number of shoping in period", col = "turquoise")
# last purchase amount
boxplot(customer_df$last_purchase_amount, main = "Last Purchase Amount", ylab = "Amount in USD", col = "turquoise")
# plot last_purchase_amount distribution
hist(customer_df$last_purchase_amount, main = "Last Purchase Amount", ylab = "Amount in USD", xlab = "Customers", col = "turquoise")
summary(customer_df$last_purchase_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.4 218.8 491.6 492.3 747.2 999.7
# to compare the data with normal distribution, we need to create a linear transformation of the last_purchase_amount variable.
# Method 1
# The linear transformation consists of two steps:
# 1. Center the data: Subtract the mean of the data from each data point.
customer_df$avg_last_purchase_amount <- mean(customer_df$last_purchase_amount)
customer_df$last_purchase_amount_centered <- customer_df$last_purchase_amount - customer_df$avg_last_purchase_amount
ggplot(customer_df, aes(x = last_purchase_amount_centered)) +
geom_histogram(bins = 30, fill = "turquoise", color = "black") +
labs(title = "Centered Last Purchase Amount", x = "Amount in USD", y = "Frequency")
# 2. Scale the data: Divide the centered data by the standard deviation of the data.
customer_df$sd_last_purchase_amount <- sd(customer_df$last_purchase_amount)
customer_df$last_purchase_amount_scaled <- customer_df$last_purchase_amount_centered / customer_df$sd_last_purchase_amount
ggplot2::ggplot(customer_df, aes(x = last_purchase_amount_scaled)) +
ggplot2::geom_histogram(bins = 30, fill = "turquoise", color = "black") +
ggplot2::labs(title = "Scaled Last Purchase Amount", x = "Z-Value", y = "Frequency")
###Z-Score Formula
𝑋 X is the individual data point,
μ μ is the mean of the data,
𝜎 σ is the standard deviation of the data. The Z-score tells us how many standard deviations a particular data point is from the mean.
# Method 2
# Z-Score Transformation
# The linear transformation can be done in a single step using the scale() function in R.
# Scale the relevant numeric variables
set.seed(123) # Set seed for reproducibility
customer_df_scaled <- scale(customer_df[, c("age", "income", "membership_years", "spending_score", "purchase_frequency", "last_purchase_amount")])
customer_summary <- describe(customer_df_scaled)
kable(customer_summary, caption = "Summary Statistics of Scaled Customer Data") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
column_spec(1:ncol(customer_summary), width = "150px") %>% # Adjust column width
row_spec(0, bold = TRUE, color = "white", background = "#007bff") %>% # Highlight header row
add_header_above(c("Summary Statistics of Scaled Customer Data" = 14)) # Adjust header columns count
| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 1 | 1000 | 0 | 1 | 0.0809056 | 0.0057837 | 1.281314 | -1.714043 | 1.676416 | 3.390458 | -0.0458622 | -1.196320 | 0.0316228 |
| income | 2 | 1000 | 0 | 1 | -0.0191436 | -0.0095213 | 1.266895 | -1.708895 | 1.795817 | 3.504712 | 0.0509118 | -1.171132 | 0.0316228 |
| membership_years | 3 | 1000 | 0 | 1 | -0.1642312 | -0.0031516 | 1.038333 | -1.564924 | 1.586635 | 3.151558 | 0.0297545 | -1.210142 | 0.0316228 |
| spending_score | 4 | 1000 | 0 | 1 | -0.0236573 | 0.0022017 | 1.280082 | -1.715928 | 1.703150 | 3.419078 | -0.0165276 | -1.219495 | 0.0316228 |
| purchase_frequency | 5 | 1000 | 0 | 1 | 0.0283635 | 0.0115139 | 1.249062 | -1.797011 | 1.643118 | 3.440129 | -0.0837142 | -1.134192 | 0.0316228 |
| last_purchase_amount | 6 | 1000 | 0 | 1 | -0.0025484 | -0.0060282 | 1.336597 | -1.629613 | 1.715642 | 3.345255 | 0.0175010 | -1.277063 | 0.0316228 |
## Determining the Optimal Number of Clusters
###To find the optimal number of clusters, we use the Elbow Method, which involves plotting the Within-Cluster Sum of Squares (WSS) for different numbers of clusters.
# select variables for scaling and clustering
Clustering_df <- customer_df_scaled[, c("age", "income", "membership_years", "spending_score", "purchase_frequency", "last_purchase_amount")]
head(Clustering_df)
## age income membership_years spending_score purchase_frequency
## [1,] -0.3844514 0.3167092 -0.8645775 1.3577884 -0.18225661
## [2,] -1.5146042 -0.2818750 -1.2147507 0.3217041 1.08146410
## [3,] 1.0780993 1.1122215 -1.2147507 -0.7143801 0.09857022
## [4,] -0.2514922 -1.2094907 1.2364614 0.8052101 -1.51618403
## [5,] 1.4104972 1.5226125 -0.8645775 -1.0252053 -0.11204990
## [6,] -0.8498085 -0.9113379 -0.8645775 -0.9215969 0.23898363
## last_purchase_amount
## [1,] -1.2808995
## [2,] -1.5230006
## [3,] -0.2298901
## [4,] 1.6892343
## [5,] -0.4911969
## [6,] -1.3711126
# We should find the optimal number of clusters for K-means algorithm.
# Elbow Method
wss_values <- numeric(15) # Create an empty vector to store the within-cluster sum of squares (WSS) values
set.seed(123) # Set seed for reproducibility
for (k in 1:15) {
kmeans_result <- kmeans(Clustering_df, centers = k, nstart = 25 # different random starting assignments
, iter.max = 300, algorithm = "Lloyd") # with iter.max = number of iterations
wss_values[k] <- kmeans_result$tot.withinss
}
# We should find on the visualization where the WSS value starts to decrease more slowly.
dotline_plot <- ggplot2::ggplot(data = data.frame(k = 1:15, WSS = wss_values), aes(x = k, y = wss_values)) +
ggplot2::geom_point(color = "turquoise", size = 4) + # turquoise points
ggplot2::geom_segment(
aes(x = k, xend = k, y = 0, yend = wss_values), # # The red dashed line, which extends from 0 to the WSS value at each k
linetype = 'dashed',
color = '#FF0000'
) +
ggplot2::geom_segment(
aes(x = 0, xend = k, y = WSS, yend = wss_values), # Black dashed horizontal line
linetype = 'dashed',
color = '#000000'
) +
ggplot2::labs(
title = "Elbow Method",
x = "Number of Clusters",
y = "Within-cluster Sum of Squares (WSS)"
) +
ggplot2::theme_minimal()
dotline_plot
# We see that the optimal number of clusters is 5, as the WSS value starts to decrease more slowly after this point.
# K-means Clustering
# We will use the kmeans() function to perform K-means clustering on the data.
optimal_k <- 5
set.seed(123) # Set seed for reproducibility
kmeans_result <- kmeans(Clustering_df, centers = optimal_k, nstart = 25, iter.max = 300, algorithm = "Lloyd")
# Print the result
print(kmeans_result)
## K-means clustering with 5 clusters of sizes 176, 194, 224, 216, 190
##
## Cluster means:
## age income membership_years spending_score purchase_frequency
## 1 0.0665521 0.6859450 0.854454327 0.30993046 0.47194224
## 2 -1.0064124 0.2027716 -0.895262791 -0.03594073 -0.30819441
## 3 -0.5289851 -0.3807565 0.611152201 0.23043006 0.12677827
## 4 0.6302855 -1.0042895 -0.002114008 -0.28139919 -0.03306736
## 5 0.8730623 0.7481683 -0.595497092 -0.20215457 -0.23435738
## last_purchase_amount
## 1 0.7921996
## 2 0.2071705
## 3 -0.9568564
## 4 0.6964549
## 5 -0.6090349
##
## Clustering vector:
## [1] 3 3 5 4 5 3 3 1 4 4 3 4 3 4 2 5 5 2 3 2 2 1 2 3 4 3 3 3 3 2 4 5 3 3 4 2 5
## [38] 1 3 3 2 5 3 1 4 5 5 3 3 5 3 2 1 3 4 4 4 1 4 2 3 3 4 5 2 2 2 3 3 4 2 4 2 3
## [75] 5 1 4 3 2 4 1 5 3 3 2 3 4 3 2 4 2 1 3 3 1 3 5 4 2 3 5 3 3 5 2 1 2 1 3 4 1
## [112] 1 4 4 5 1 5 4 1 4 1 3 3 3 4 3 4 3 4 1 4 3 1 3 3 1 3 5 3 2 1 3 1 2 1 4 4 2
## [149] 2 5 5 5 4 2 5 2 4 4 1 3 5 5 3 1 5 4 3 3 3 4 2 2 1 4 2 2 5 5 3 5 5 1 3 3 3
## [186] 1 5 5 2 2 2 3 4 1 4 5 3 3 3 5 1 4 5 5 1 2 3 5 5 3 3 2 1 2 5 1 4 5 4 1 3 4
## [223] 1 4 5 3 1 4 3 4 3 5 3 3 2 3 4 5 1 1 3 2 2 1 2 4 3 3 1 5 3 2 5 1 3 4 2 2 5
## [260] 4 3 4 1 4 1 1 1 2 3 3 4 5 1 1 2 2 4 4 2 4 4 4 1 5 5 5 2 5 1 2 4 4 4 2 1 5
## [297] 5 5 4 5 5 5 4 1 4 4 2 4 5 3 5 1 4 1 5 2 2 2 4 3 5 5 4 1 2 4 4 3 5 3 1 2 2
## [334] 5 5 1 5 1 3 4 4 4 3 5 1 5 3 5 4 5 5 3 1 4 2 1 5 5 4 3 1 2 2 1 2 1 4 1 4 3
## [371] 2 3 2 3 4 3 2 1 3 5 1 5 5 5 1 1 1 2 3 2 5 2 1 4 1 4 3 3 3 2 2 5 5 3 2 1 2
## [408] 1 4 2 5 4 2 5 4 4 4 1 4 4 1 3 5 5 3 1 4 4 3 5 3 1 1 4 2 5 4 5 5 1 3 2 1 5
## [445] 5 4 1 3 4 4 2 3 3 2 3 4 2 4 2 5 1 4 4 1 3 5 2 4 1 5 1 4 1 5 1 5 5 5 5 3 3
## [482] 5 4 3 4 5 3 1 2 2 2 1 5 1 1 2 5 4 3 4 2 4 3 3 3 5 5 3 5 2 1 1 2 4 3 3 2 1
## [519] 4 4 5 2 4 3 2 5 5 2 5 2 4 1 2 2 5 4 3 4 3 3 2 5 4 5 2 1 1 3 4 4 3 2 4 4 5
## [556] 2 1 3 1 5 5 3 2 5 1 2 3 4 3 1 4 3 1 5 4 5 3 4 3 5 1 2 2 3 4 5 2 4 2 2 5 2
## [593] 1 3 2 4 1 4 4 3 1 2 3 5 3 4 3 2 4 3 3 3 4 5 1 4 2 5 1 4 2 3 2 4 4 3 2 2 1
## [630] 5 1 2 1 1 2 2 4 3 3 1 5 1 5 5 5 1 2 2 2 1 2 3 5 5 3 3 2 5 3 5 4 4 2 4 3 1
## [667] 3 4 3 4 2 1 4 4 5 4 1 4 1 3 3 5 1 4 3 2 4 2 4 1 3 2 2 4 3 4 3 4 4 4 1 3 2
## [704] 5 3 2 5 3 3 3 3 2 2 2 1 1 1 5 1 5 1 2 2 2 3 3 3 2 1 3 4 2 4 3 1 1 2 1 5 5
## [741] 3 3 5 1 1 4 3 4 2 3 3 2 2 4 4 3 2 3 5 3 5 1 5 5 2 3 3 2 3 4 4 3 4 4 3 2 3
## [778] 3 4 4 5 1 3 4 3 2 1 3 3 5 5 2 5 2 4 4 4 4 2 2 5 1 5 4 5 1 2 2 1 1 3 4 1 1
## [815] 3 1 2 5 3 3 4 4 4 2 2 3 4 3 1 2 2 4 1 1 2 2 5 1 4 4 1 2 5 3 4 4 2 5 3 5 4
## [852] 5 2 3 2 5 1 5 3 4 2 3 1 1 5 2 4 5 3 5 1 4 2 2 3 1 5 3 2 5 5 4 5 3 1 1 4 2
## [889] 2 4 3 4 3 1 2 5 3 5 5 4 5 5 2 5 5 2 5 5 1 4 4 4 3 1 3 1 4 3 3 4 1 5 5 2 4
## [926] 3 4 1 2 1 3 4 2 5 5 5 3 4 2 4 4 4 5 4 4 1 5 2 5 2 1 4 1 5 3 4 4 3 3 4 2 1
## [963] 4 2 3 1 3 2 4 5 4 1 1 1 4 4 2 3 5 1 2 1 5 1 2 4 3 2 2 2 2 1 3 2 4 5 3 3 1
## [1000] 2
##
## Within cluster sum of squares by cluster:
## [1] 639.0520 773.6550 927.1793 881.0667 775.0201
## (between_SS / total_SS = 33.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Add the cluster assignments to the original data frame
customer_df$cluster <- as.factor(kmeans_result$cluster)
ggplot(customer_df, aes(x = age, y = income)) +
geom_point(aes(color = cluster)) +
facet_wrap(~ cluster, labeller = labeller(cluster = function(x) paste("Cluster", x))) + # "Cluster" to the facet titles
theme(
strip.text = element_text(size = 12),
panel.spacing = unit(2, "lines") # distance between facets
)
# Plot selected variables with clusters
plot_clusters <- ggpairs(customer_df,
columns = c(4:7, 9),
aes(color = cluster))
theme(
strip.text = element_text(size = 7),
strip.background = element_rect(color = "black", size = 0.5),
axis.text = element_text(size = 6),
legend.text = element_text(size = 7),
panel.spacing = unit(0.5, "lines")
)
## List of 5
## $ axis.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : num 6
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : num 7
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ panel.spacing : 'simpleUnit' num 0.5lines
## ..- attr(*, "unit")= int 3
## $ strip.background:List of 5
## ..$ fill : NULL
## ..$ colour : chr "black"
## ..$ linewidth : num 0.5
## ..$ linetype : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ strip.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : num 7
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
print(plot_clusters)
# Let's create some buyer personas based on the clusters.
describe(customer_df[customer_df$cluster == 1, ])
## vars n mean sd median trimmed
## id 1 176 510.19 281.54 490.00 507.77
## age 2 176 44.78 13.66 45.00 44.99
## gender* 3 176 2.06 0.79 2.00 2.08
## income 4 176 111981.23 23765.50 112898.00 113256.25
## spending_score 5 176 59.66 26.55 68.00 60.78
## membership_years 6 176 7.91 1.92 8.00 8.11
## purchase_frequency 7 176 33.32 11.83 35.00 34.08
## preferred_category* 8 176 3.07 1.33 3.00 3.09
## last_purchase_amount 9 176 726.64 178.54 752.00 739.49
## avg_last_purchase_amount 10 176 492.35 0.00 492.35 492.35
## last_purchase_amount_centered 11 176 234.29 178.54 259.65 247.14
## sd_last_purchase_amount 12 176 295.74 0.00 295.74 295.74
## last_purchase_amount_scaled 13 176 0.79 0.60 0.88 0.84
## cluster* 14 176 1.00 0.00 1.00 1.00
## mad min max range skew
## id 352.12 8.00 999.00 991.00 0.07
## age 17.79 18.00 69.00 51.00 -0.13
## gender* 1.48 1.00 3.00 2.00 -0.11
## income 28558.58 38683.00 149936.00 111253.00 -0.43
## spending_score 29.65 2.00 100.00 98.00 -0.34
## membership_years 1.48 2.00 10.00 8.00 -0.75
## purchase_frequency 13.34 5.00 50.00 45.00 -0.42
## preferred_category* 1.48 1.00 5.00 4.00 0.00
## last_purchase_amount 198.23 171.38 999.74 828.36 -0.59
## avg_last_purchase_amount 0.00 492.35 492.35 0.00 NaN
## last_purchase_amount_centered 198.23 -320.97 507.39 828.36 -0.59
## sd_last_purchase_amount 0.00 295.74 295.74 0.00 NaN
## last_purchase_amount_scaled 0.67 -1.09 1.72 2.80 -0.59
## cluster* 0.00 1.00 1.00 0.00 NaN
## kurtosis se
## id -1.20 21.22
## age -0.95 1.03
## gender* -1.42 0.06
## income -0.45 1791.39
## spending_score -1.06 2.00
## membership_years -0.30 0.14
## purchase_frequency -0.82 0.89
## preferred_category* -1.17 0.10
## last_purchase_amount -0.18 13.46
## avg_last_purchase_amount NaN 0.00
## last_purchase_amount_centered -0.18 13.46
## sd_last_purchase_amount NaN 0.00
## last_purchase_amount_scaled -0.18 0.05
## cluster* NaN 0.00
# Define the Mode function to determine the most frequent value
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
# Create an empty DataFrame for the buyer personas
bpersona_df <- data.frame()
# Loop through the clusters
for (i in 1:5) {
# Filter the DataFrame for the current cluster and female gender
cluster_df <- customer_df[customer_df$cluster == i & customer_df$gender == "Female", ]
# Calculate median values for key attributes
median_age <- median(cluster_df$age)
median_income <- median(cluster_df$income)
median_spending_score <- median(cluster_df$spending_score)
median_membership_years <- median(cluster_df$membership_years)
median_purchase_frequency <- median(cluster_df$purchase_frequency)
median_last_purchase_amount <- median(cluster_df$last_purchase_amount)
# Create a new DataFrame for the buyer persona
persona <- data.frame(
cluster = i,
age = median_age,
gender = "Female", # Since we filtered for females
income = median_income,
spending_score = median_spending_score,
membership_years = median_membership_years,
purchase_frequency = median_purchase_frequency,
last_purchase_amount = median_last_purchase_amount
)
# Add a new row for the persona to the bpersona_df
bpersona_df <- rbind(bpersona_df, persona)
}
print(bpersona_df)
## cluster age gender income spending_score membership_years
## 1 1 47.0 Female 117023.5 51 8
## 2 2 29.0 Female 93266.0 49 3
## 3 3 34.0 Female 69790.0 63 7
## 4 4 55.5 Female 53555.5 41 5
## 5 5 60.0 Female 116114.5 34 3
## purchase_frequency last_purchase_amount
## 1 37 759.755
## 2 18 499.070
## 3 28 191.900
## 4 25 722.090
## 5 19 240.390
# Create 2nd buyer persona group
# Define the Mode function to determine the most frequent value
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
# Create an empty DataFrame for the male buyer personas
bpersona_male_df <- data.frame()
# Loop through the clusters
for (i in 1:5) {
# Filter the DataFrame for the current cluster and male gender
cluster_df <- customer_df[customer_df$cluster == i & customer_df$gender == "Male", ]
# Calculate median values for key attributes
median_age <- median(cluster_df$age)
median_income <- median(cluster_df$income)
median_spending_score <- median(cluster_df$spending_score)
median_membership_years <- median(cluster_df$membership_years)
median_purchase_frequency <- median(cluster_df$purchase_frequency)
median_last_purchase_amount <- median(cluster_df$last_purchase_amount)
# Create a new DataFrame for the buyer persona
persona <- data.frame(
cluster = i,
age = median_age,
gender = "Male", # Since we filtered for males
income = median_income,
spending_score = median_spending_score,
membership_years = median_membership_years,
purchase_frequency = median_purchase_frequency,
last_purchase_amount = median_last_purchase_amount
)
# Add a new row for the persona to the bpersona_male_df
bpersona_male_df <- rbind(bpersona_male_df, persona)
}
print(bpersona_male_df)
## cluster age gender income spending_score membership_years purchase_frequency
## 1 1 41.0 Male 113154 66 8 36.0
## 2 2 27.0 Male 100292 44 3 26.5
## 3 3 35.0 Male 67039 61 8 30.0
## 4 4 52.0 Male 47990 31 6 28.0
## 5 5 54.5 Male 124733 43 4 26.5
## last_purchase_amount
## 1 741.41
## 2 651.42
## 3 149.07
## 4 711.04
## 5 238.53
# Append the two DataFrames
combined_df <- rbind(bpersona_df, bpersona_male_df)
# Add a Buyer Persona ID to each row
combined_df$Buyer_Persona_ID <- 1:nrow(combined_df)
print(combined_df)
## cluster age gender income spending_score membership_years
## 1 1 47.0 Female 117023.5 51 8
## 2 2 29.0 Female 93266.0 49 3
## 3 3 34.0 Female 69790.0 63 7
## 4 4 55.5 Female 53555.5 41 5
## 5 5 60.0 Female 116114.5 34 3
## 6 1 41.0 Male 113154.0 66 8
## 7 2 27.0 Male 100292.0 44 3
## 8 3 35.0 Male 67039.0 61 8
## 9 4 52.0 Male 47990.0 31 6
## 10 5 54.5 Male 124733.0 43 4
## purchase_frequency last_purchase_amount Buyer_Persona_ID
## 1 37.0 759.755 1
## 2 18.0 499.070 2
## 3 28.0 191.900 3
## 4 25.0 722.090 4
## 5 19.0 240.390 5
## 6 36.0 741.410 6
## 7 26.5 651.420 7
## 8 30.0 149.070 8
## 9 28.0 711.040 9
## 10 26.5 238.530 10
# Create a simple table using kable
kable(combined_df, caption = "Combined Buyer Persona Data") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>% # Bootstrap styling options
column_spec(1:ncol(combined_df), width = "150px") %>% # Adjusting the width of columns
row_spec(0, bold = TRUE, color = "white", background = "#007bff") %>% # Styling header row
add_header_above(c("Buyer Persona Data" = ncol(combined_df)))
| cluster | age | gender | income | spending_score | membership_years | purchase_frequency | last_purchase_amount | Buyer_Persona_ID |
|---|---|---|---|---|---|---|---|---|
| 1 | 47.0 | Female | 117023.5 | 51 | 8 | 37.0 | 759.755 | 1 |
| 2 | 29.0 | Female | 93266.0 | 49 | 3 | 18.0 | 499.070 | 2 |
| 3 | 34.0 | Female | 69790.0 | 63 | 7 | 28.0 | 191.900 | 3 |
| 4 | 55.5 | Female | 53555.5 | 41 | 5 | 25.0 | 722.090 | 4 |
| 5 | 60.0 | Female | 116114.5 | 34 | 3 | 19.0 | 240.390 | 5 |
| 1 | 41.0 | Male | 113154.0 | 66 | 8 | 36.0 | 741.410 | 6 |
| 2 | 27.0 | Male | 100292.0 | 44 | 3 | 26.5 | 651.420 | 7 |
| 3 | 35.0 | Male | 67039.0 | 61 | 8 | 30.0 | 149.070 | 8 |
| 4 | 52.0 | Male | 47990.0 | 31 | 6 | 28.0 | 711.040 | 9 |
| 5 | 54.5 | Male | 124733.0 | 43 | 4 | 26.5 | 238.530 | 10 |