This segmentation analysis is an attempt to find distinct groups for marketing purposes.
# load data
assignment_data <- read_csv("e:/Portfolio Projects/Segmentation/Data/customer_segmentation.csv")
## Rows: 2240 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Education, Marital_Status, Dt_Customer
## dbl (26): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntF...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kable(assignment_data[1:5, 1:10], caption = "Examine first few rows")
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines |
|---|---|---|---|---|---|---|---|---|---|
| 5524 | 1957 | Graduation | Single | 58138 | 0 | 0 | 4/9/2012 | 58 | 635 |
| 2174 | 1954 | Graduation | Single | 46344 | 1 | 1 | 8/3/2014 | 38 | 11 |
| 4141 | 1965 | Graduation | Together | 71613 | 0 | 0 | 21-08-2013 | 26 | 426 |
| 6182 | 1984 | Graduation | Together | 26646 | 1 | 0 | 10/2/2014 | 26 | 11 |
| 5324 | 1981 | PhD | Married | 58293 | 1 | 0 | 19-01-2014 | 94 | 173 |
# examine column labels
names(assignment_data)
## [1] "ID" "Year_Birth" "Education"
## [4] "Marital_Status" "Income" "Kidhome"
## [7] "Teenhome" "Dt_Customer" "Recency"
## [10] "MntWines" "MntFruits" "MntMeatProducts"
## [13] "MntFishProducts" "MntSweetProducts" "MntGoldProds"
## [16] "NumDealsPurchases" "NumWebPurchases" "NumCatalogPurchases"
## [19] "NumStorePurchases" "NumWebVisitsMonth" "AcceptedCmp3"
## [22] "AcceptedCmp4" "AcceptedCmp5" "AcceptedCmp1"
## [25] "AcceptedCmp2" "Complain" "Z_CostContact"
## [28] "Z_Revenue" "Response"
# check variables for missing values
missing_seg_vals<- assignment_data %>% map(anyNA)
missings<- names(which(missing_seg_vals == TRUE))
missings
## [1] "Income" "NumDealsPurchases"
# get a sense of how many rows have missing values
missing_eval<- select(assignment_data, contains(missings))
nrow(missing_eval[which(subset(is.na(missing_eval))==TRUE),])
## [1] 26
# select seg vars and store as new df
kmeans_df<- select(assignment_data, starts_with("Mnt")) %>% modify(as.numeric)
colnames(kmeans_df)
## [1] "MntWines" "MntFruits" "MntMeatProducts" "MntFishProducts"
## [5] "MntSweetProducts" "MntGoldProds"
# # median imputation for missing values # NOT NEEDED THIS TIME #
# kmeans_df_imputed <- kmeans_df %>%
# mutate(across(everything(), ~ if_else(is.na(.), median(., na.rm = TRUE), .)))
kmeans_df_imputed <- kmeans_df
# create plot of number of clusters vs total within sum of squares
scree.kmeans<- fviz_nbclust(kmeans_df_imputed, kmeans, method = "wss")
scree.kmeans
# make this example reproducible
set.seed(1988)
kmeans_obj<- kcca(kmeans_df_imputed, 4, family=kccaFamily("kmeans"), weights=NULL, group=NULL,
control=NULL, simple=FALSE, save.data=FALSE)
# visualize clusters
prcomp_segs<- prcomp(kmeans_df_imputed)
prcomp_df<- as.data.frame(prcomp_segs$x[,1:2])
ggplot(prcomp_df, aes(x = PC1, y = PC2, color = factor(kmeans_obj@cluster))) +
labs(color = "Segment") +
geom_point()
# seg_var profiling
assignment_data_profile_seg_var<- cbind(select(assignment_data, starts_with('Mnt')), clusters = kmeans_obj@cluster)
table(assignment_data_profile_seg_var$clusters)
##
## 1 2 3 4
## 200 554 277 1209
colnames(assignment_data_profile_seg_var)
## [1] "MntWines" "MntFruits" "MntMeatProducts" "MntFishProducts"
## [5] "MntSweetProducts" "MntGoldProds" "clusters"
# d_var profiling
# create factors for Year_Birth, Income, Kidhome, and Teenhome
# create a new age factor variable
# check for entry errors or missing values
min(assignment_data$Year_Birth)
## [1] 1893
max(assignment_data$Year_Birth)
## [1] 1996
# create age variable
assignment_data$age<- 2024 - assignment_data$Year_Birth
min(assignment_data$age)
## [1] 28
max(assignment_data$age)
## [1] 131
# remove Year_Birth values for anyone over 100. Assuming ages over 100+ are entry errors
assignment_data$age<- replace(assignment_data$age, assignment_data$age > 100, NA)
min(assignment_data$age, na.rm = TRUE)
## [1] 28
max(assignment_data$age, na.rm = TRUE)
## [1] 84
breaks <- c(26, 35, 45, 55, 65, 100)
assignment_data <- assignment_data %>%
mutate(age.factor = cut(age, breaks = breaks, labels = c("1: 26 to 35", "2: 36 to 45", "3: 46 to 55", "4: 56 to 65", "5: Over 65")))
table(assignment_data$age.factor)
##
## 1: 26 to 35 2: 36 to 45 3: 46 to 55 4: 56 to 65 5: Over 65
## 91 386 758 486 516
# create a new factor variable based on the ranges
# check for entry errors or missing values
min(assignment_data$Income)
## [1] NA
max(assignment_data$Income)
## [1] NA
# impute missing values with median
assignment_data$Income<- if_else(is.na(assignment_data$Income), median(assignment_data$Income, na.rm = TRUE), assignment_data$Income)
min(assignment_data$Income)
## [1] 1730
max(assignment_data$Income)
## [1] 666666
# 1730 for min and 666666 for max seems fishy but I'll leave them
descr(assignment_data$Income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1730 35539 51382 52238 68290 666666
breaks <- c(0, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, max(assignment_data$Income))
assignment_data <- assignment_data %>%
mutate(income.factor = cut(Income, breaks = breaks, labels = c("1: $30,000 or less", "2: $30,001 to $40,000", "3: $40,001 to $50,000", "4: $50,001 to $60,000", "5: $60,001 to $70,000", "6: $70,001 to $80,000", "7: $80,001 to $90,000", "8: $90,001 to $100,000", "9: More than $100,000")))
table(assignment_data$income.factor)
##
## 1: $30,000 or less 2: $30,001 to $40,000 3: $40,001 to $50,000
## 370 362 328
## 4: $50,001 to $60,000 5: $60,001 to $70,000 6: $70,001 to $80,000
## 339 333 291
## 7: $80,001 to $90,000 8: $90,001 to $100,000 9: More than $100,000
## 164 40 13
# create Kidhome and Teenhome factor variables
# check for entry errors or missing values
descr(assignment_data$Kidhome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4442 1.0000 2.0000
descr(assignment_data$Teenhome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5062 1.0000 2.0000
assignment_data$kid.factor<- factor(assignment_data$Kidhome)
assignment_data$teen.factor<- factor(assignment_data$Teenhome)
assignment_data_profile_d_var<- cbind(select(assignment_data, starts_with(c("age.factor","Education","Marital_Status","income.factor","kid.factor","teen.factor"))), clusters = kmeans_obj@cluster)
table(assignment_data_profile_d_var$clusters)
##
## 1 2 3 4
## 200 554 277 1209
colnames(assignment_data_profile_d_var)
## [1] "age.factor" "Education" "Marital_Status" "income.factor"
## [5] "kid.factor" "teen.factor" "clusters"
# append the clusters to kmeans df
kmeans_df_imputed$clusters<- assignment_data_profile_d_var$clusters
colnames(kmeans_df_imputed)
## [1] "MntWines" "MntFruits" "MntMeatProducts" "MntFishProducts"
## [5] "MntSweetProducts" "MntGoldProds" "clusters"
# explore attribute importance for the clusters across the seg vars
set.seed(1988)
rand_frst <- randomForest(clusters ~ ., data = kmeans_df_imputed, ntree = 1000,
importance = TRUE, proximity = TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
varImpPlot(rand_frst, sort = TRUE, main = "Variable Importance Plot")
profile_seg_var<- assignment_data_profile_seg_var
seg_var_freqs_list<- list()
# loop through each column in the dataframe
for (col_name in colnames(profile_seg_var)) {
# group by clusters count each d_ variable
d_var_result <- profile_seg_var %>%
group_by(clusters) %>%
count(!!sym(col_name)) %>%
mutate(pct = n / sum(n) * 100) %>%
as.data.frame()
# append the result to the list
seg_var_freqs_list[[col_name]] <- d_var_result
}
# rename all seg_vars to plot easily, d_vars used in plotting function
seg_var_freqs_list <- map(seg_var_freqs_list, ~ .x %>%
rename(d_vars = names(.x)[2]))
# drop some d_elements that aren't suited for plotting
seg_var_freqs_list<- seg_var_freqs_list[names(seg_var_freqs_list) %in% "clusters" == FALSE]
# # verify removal
names(seg_var_freqs_list)
## [1] "MntWines" "MntFruits" "MntMeatProducts" "MntFishProducts"
## [5] "MntSweetProducts" "MntGoldProds"
d_var_freqs_list<- list()
# loop through each column in the dataframe
for (col_name in colnames(assignment_data_profile_d_var)) {
# group by clusters count each d_ variable
d_var_result <- assignment_data_profile_d_var %>%
group_by(clusters) %>%
count(!!sym(col_name)) %>%
mutate(pct = n / sum(n) * 100) %>%
as.data.frame()
# append the result to the list
d_var_freqs_list[[col_name]] <- d_var_result
}
# rename all d_vars to plot easily
d_var_freqs_list <- map(d_var_freqs_list, ~ .x %>%
rename(d_vars = names(.x)[2]))
# drop some d_elements that aren't suited for plotting
d_var_freqs_list_profiling<- d_var_freqs_list[names(d_var_freqs_list) %in% "clusters" == FALSE]
# verify removal
names(d_var_freqs_list_profiling)
## [1] "age.factor" "Education" "Marital_Status" "income.factor"
## [5] "kid.factor" "teen.factor"
# create a function to generate a bar chart
create_bar_chart <- function(df) {ggplot(df, aes(x = factor(clusters), y = d_vars, fill = factor(clusters))) +
geom_bar(stat = "identity") +
labs(x = "Segment", y = "Spend", fill = "Segment") +
theme_minimal()}
# create a list of plots for each dataframe
seg_var_plots_list <- map(seg_var_freqs_list, create_bar_chart)
seg_var_plots_labs <- names(seg_var_plots_list)
seg_var_plots<- map2(seg_var_plots_list, seg_var_plots_labs, ~ .x + labs(title = .y))
# view grids
seg_var_plots <- marrangeGrob(seg_var_plots, nrow=1, ncol=2)
seg_var_plots
# function for demographic profiling plots
create_demo_bar_chart <- function(df) {
ggplot(df, aes(x = clusters, y = pct, fill = d_vars)) +
geom_bar(stat = "identity") +
labs(title = "", x = "Segment", y = "Percentage") +
guides(fill = guide_legend(title = "Levels"))
}
# create a list of plots for each dataframe
d_var_plots_list <- map(d_var_freqs_list_profiling, create_demo_bar_chart)
# set titles
d_var_plots<- map2(d_var_plots_list, names(d_var_freqs_list_profiling), ~ .x + labs(title = .y))
# view grids
d_var_plots <- marrangeGrob(d_var_plots, nrow=1, ncol=2)
d_var_plots
From the analysis we can see that segment 1 spends more than the others on meat and is a high spender in all categories other than wines and gold products. Segment 2 is a generally high spending group except for meats. Segment 3 spends the most on wine by far and has a medium spend in all other categories. Segment 4 is probably your average consumer, spending the least throughout all categories.
Demographics: Age and marital status aren’t particularly notable. Segments 1 and 4 have a similar age profile as do segments 2 and 3. Martial status is similar for all segments.
In terms of targeted marketing, segments 1 and 3 are clearly unique in there spending behavior.
Segment 1 (Meat eaters with money) are very high earners with no kids or teens in the household. They have spare money and they spend it on meat, although they also spend a significant amount in other categories.
Segment 3 (Educated winos) are highly educated, comprised mostly of people with advanced degrees and high earners who love to buy wine.
Segmentation is a extremely valuable tool for marketing. It allows a business to allocate valuable marketing resources efficiently, and provides a clear profile of current and potential customers.