R Markdown

This segmentation analysis is an attempt to find distinct groups for marketing purposes.

Read in the data file

# load data
assignment_data <- read_csv("e:/Portfolio Projects/Segmentation/Data/customer_segmentation.csv")
## Rows: 2240 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Education, Marital_Status, Dt_Customer
## dbl (26): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntF...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kable(assignment_data[1:5, 1:10], caption = "Examine first few rows")
Examine first few rows
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines
5524 1957 Graduation Single 58138 0 0 4/9/2012 58 635
2174 1954 Graduation Single 46344 1 1 8/3/2014 38 11
4141 1965 Graduation Together 71613 0 0 21-08-2013 26 426
6182 1984 Graduation Together 26646 1 0 10/2/2014 26 11
5324 1981 PhD Married 58293 1 0 19-01-2014 94 173

Explore labels in the data

# examine column labels
names(assignment_data)
##  [1] "ID"                  "Year_Birth"          "Education"          
##  [4] "Marital_Status"      "Income"              "Kidhome"            
##  [7] "Teenhome"            "Dt_Customer"         "Recency"            
## [10] "MntWines"            "MntFruits"           "MntMeatProducts"    
## [13] "MntFishProducts"     "MntSweetProducts"    "MntGoldProds"       
## [16] "NumDealsPurchases"   "NumWebPurchases"     "NumCatalogPurchases"
## [19] "NumStorePurchases"   "NumWebVisitsMonth"   "AcceptedCmp3"       
## [22] "AcceptedCmp4"        "AcceptedCmp5"        "AcceptedCmp1"       
## [25] "AcceptedCmp2"        "Complain"            "Z_CostContact"      
## [28] "Z_Revenue"           "Response"

Are there missing values in the segmentation variables?

# check variables for missing values
missing_seg_vals<- assignment_data %>% map(anyNA)
missings<- names(which(missing_seg_vals == TRUE))
missings
## [1] "Income"            "NumDealsPurchases"
# get a sense of how many rows have missing values
missing_eval<- select(assignment_data, contains(missings))
nrow(missing_eval[which(subset(is.na(missing_eval))==TRUE),])
## [1] 26

Prepare for the segmentation analysis

# select seg vars and store as new df
kmeans_df<- select(assignment_data, starts_with("Mnt")) %>% modify(as.numeric)
colnames(kmeans_df)
## [1] "MntWines"         "MntFruits"        "MntMeatProducts"  "MntFishProducts" 
## [5] "MntSweetProducts" "MntGoldProds"
# # median imputation for missing values # NOT NEEDED THIS TIME #
# kmeans_df_imputed <- kmeans_df %>%
#   mutate(across(everything(), ~ if_else(is.na(.), median(., na.rm = TRUE), .)))
kmeans_df_imputed <- kmeans_df

Segmentation

Scree plot to determine k clusters

# create plot of number of clusters vs total within sum of squares
scree.kmeans<- fviz_nbclust(kmeans_df_imputed, kmeans, method = "wss")
scree.kmeans

K-means

Implement k-means, k = 4

# make this example reproducible
set.seed(1988)

kmeans_obj<- kcca(kmeans_df_imputed, 4, family=kccaFamily("kmeans"), weights=NULL, group=NULL,
     control=NULL, simple=FALSE, save.data=FALSE)

# visualize clusters
prcomp_segs<- prcomp(kmeans_df_imputed)

prcomp_df<- as.data.frame(prcomp_segs$x[,1:2])

ggplot(prcomp_df, aes(x = PC1, y = PC2, color = factor(kmeans_obj@cluster))) +
  labs(color = "Segment") +
  geom_point()

Profiling

Subset the data to profile the demographic variables and Mnt (segmentation) variables separately

# seg_var profiling
assignment_data_profile_seg_var<- cbind(select(assignment_data, starts_with('Mnt')), clusters = kmeans_obj@cluster)
table(assignment_data_profile_seg_var$clusters)
## 
##    1    2    3    4 
##  200  554  277 1209
colnames(assignment_data_profile_seg_var)
## [1] "MntWines"         "MntFruits"        "MntMeatProducts"  "MntFishProducts" 
## [5] "MntSweetProducts" "MntGoldProds"     "clusters"
# d_var profiling

# create factors for Year_Birth, Income, Kidhome, and Teenhome

# create a new age factor variable
# check for entry errors or missing values
min(assignment_data$Year_Birth)
## [1] 1893
max(assignment_data$Year_Birth)
## [1] 1996
# create age variable
assignment_data$age<- 2024 - assignment_data$Year_Birth
min(assignment_data$age)
## [1] 28
max(assignment_data$age)
## [1] 131
# remove Year_Birth values for anyone over 100. Assuming ages over 100+ are entry errors
assignment_data$age<- replace(assignment_data$age, assignment_data$age > 100, NA)
min(assignment_data$age, na.rm = TRUE)
## [1] 28
max(assignment_data$age, na.rm = TRUE)
## [1] 84
breaks <- c(26, 35, 45, 55, 65, 100)
assignment_data <- assignment_data %>%
  mutate(age.factor = cut(age, breaks = breaks, labels = c("1: 26 to 35", "2: 36 to 45", "3: 46 to 55", "4: 56 to 65", "5: Over 65")))
table(assignment_data$age.factor)
## 
## 1: 26 to 35 2: 36 to 45 3: 46 to 55 4: 56 to 65  5: Over 65 
##          91         386         758         486         516
# create a new factor variable based on the ranges
# check for entry errors or missing values
min(assignment_data$Income)
## [1] NA
max(assignment_data$Income)
## [1] NA
# impute missing values with median
assignment_data$Income<- if_else(is.na(assignment_data$Income), median(assignment_data$Income, na.rm = TRUE), assignment_data$Income)
min(assignment_data$Income)
## [1] 1730
max(assignment_data$Income)
## [1] 666666
# 1730 for min and 666666 for max seems fishy but I'll leave them
descr(assignment_data$Income)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1730   35539   51382   52238   68290  666666
breaks <- c(0, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, max(assignment_data$Income))
assignment_data <- assignment_data %>%
  mutate(income.factor = cut(Income, breaks = breaks, labels = c("1: $30,000 or less", "2: $30,001 to $40,000", "3: $40,001 to $50,000", "4: $50,001 to $60,000", "5: $60,001 to $70,000", "6: $70,001 to $80,000", "7: $80,001 to $90,000", "8: $90,001 to $100,000", "9: More than $100,000")))
table(assignment_data$income.factor)
## 
##     1: $30,000 or less  2: $30,001 to $40,000  3: $40,001 to $50,000 
##                    370                    362                    328 
##  4: $50,001 to $60,000  5: $60,001 to $70,000  6: $70,001 to $80,000 
##                    339                    333                    291 
##  7: $80,001 to $90,000 8: $90,001 to $100,000  9: More than $100,000 
##                    164                     40                     13
# create Kidhome and Teenhome factor variables
# check for entry errors or missing values
descr(assignment_data$Kidhome)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4442  1.0000  2.0000
descr(assignment_data$Teenhome)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.5062  1.0000  2.0000
assignment_data$kid.factor<- factor(assignment_data$Kidhome)
assignment_data$teen.factor<- factor(assignment_data$Teenhome)

assignment_data_profile_d_var<- cbind(select(assignment_data, starts_with(c("age.factor","Education","Marital_Status","income.factor","kid.factor","teen.factor"))), clusters = kmeans_obj@cluster)
table(assignment_data_profile_d_var$clusters)
## 
##    1    2    3    4 
##  200  554  277 1209
colnames(assignment_data_profile_d_var)
## [1] "age.factor"     "Education"      "Marital_Status" "income.factor" 
## [5] "kid.factor"     "teen.factor"    "clusters"
# append the clusters to kmeans df
kmeans_df_imputed$clusters<- assignment_data_profile_d_var$clusters
colnames(kmeans_df_imputed)
## [1] "MntWines"         "MntFruits"        "MntMeatProducts"  "MntFishProducts" 
## [5] "MntSweetProducts" "MntGoldProds"     "clusters"
# explore attribute importance for the clusters across the seg vars
set.seed(1988)
rand_frst <- randomForest(clusters ~ ., data = kmeans_df_imputed, ntree = 1000,
                          importance = TRUE, proximity = TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
varImpPlot(rand_frst, sort = TRUE, main = "Variable Importance Plot")

Frequencies for seg vars

profile_seg_var<- assignment_data_profile_seg_var

seg_var_freqs_list<- list()

# loop through each column in the dataframe
for (col_name in colnames(profile_seg_var)) {
  # group by clusters count each d_ variable
  d_var_result <- profile_seg_var %>%
    group_by(clusters) %>%
    count(!!sym(col_name)) %>%
    mutate(pct = n / sum(n) * 100) %>%
    as.data.frame()

  # append the result to the list
  seg_var_freqs_list[[col_name]] <- d_var_result
}

# rename all seg_vars to plot easily, d_vars used in plotting function
seg_var_freqs_list <- map(seg_var_freqs_list, ~ .x %>%
                    rename(d_vars = names(.x)[2]))

# drop some d_elements that aren't suited for plotting
seg_var_freqs_list<- seg_var_freqs_list[names(seg_var_freqs_list) %in% "clusters" == FALSE]

# # verify removal
names(seg_var_freqs_list)
## [1] "MntWines"         "MntFruits"        "MntMeatProducts"  "MntFishProducts" 
## [5] "MntSweetProducts" "MntGoldProds"

Prepare for profiling visualizations

d_var_freqs_list<- list()

# loop through each column in the dataframe
for (col_name in colnames(assignment_data_profile_d_var)) {
  # group by clusters count each d_ variable
  d_var_result <- assignment_data_profile_d_var %>%
    group_by(clusters) %>%
    count(!!sym(col_name)) %>%
    mutate(pct = n / sum(n) * 100) %>%
    as.data.frame()

  # append the result to the list
  d_var_freqs_list[[col_name]] <- d_var_result
}

# rename all d_vars to plot easily
d_var_freqs_list <- map(d_var_freqs_list, ~ .x %>%
                    rename(d_vars = names(.x)[2]))

# drop some d_elements that aren't suited for plotting
d_var_freqs_list_profiling<- d_var_freqs_list[names(d_var_freqs_list) %in% "clusters" == FALSE]

# verify removal
names(d_var_freqs_list_profiling)
## [1] "age.factor"     "Education"      "Marital_Status" "income.factor" 
## [5] "kid.factor"     "teen.factor"

Visualizations for profiling

Profiling with segmentation input variables to validate internal consistency

# create a function to generate a bar chart

create_bar_chart <- function(df) {ggplot(df, aes(x = factor(clusters), y = d_vars, fill = factor(clusters))) +
  geom_bar(stat = "identity") +
  labs(x = "Segment", y = "Spend", fill = "Segment") +
  theme_minimal()}

# create a list of plots for each dataframe
seg_var_plots_list <- map(seg_var_freqs_list, create_bar_chart)

seg_var_plots_labs <- names(seg_var_plots_list)
seg_var_plots<- map2(seg_var_plots_list, seg_var_plots_labs, ~ .x + labs(title = .y))

# view grids
seg_var_plots <- marrangeGrob(seg_var_plots, nrow=1, ncol=2)
seg_var_plots

Demographic profiling across segments

# function for demographic profiling plots
create_demo_bar_chart <- function(df) {
  ggplot(df, aes(x = clusters, y = pct, fill = d_vars)) +
    geom_bar(stat = "identity") +
    labs(title = "", x = "Segment", y = "Percentage") +
    guides(fill = guide_legend(title = "Levels"))
}

# create a list of plots for each dataframe
d_var_plots_list <- map(d_var_freqs_list_profiling, create_demo_bar_chart)

# set titles
d_var_plots<- map2(d_var_plots_list, names(d_var_freqs_list_profiling), ~ .x + labs(title = .y))

# view grids
d_var_plots <- marrangeGrob(d_var_plots, nrow=1, ncol=2)
d_var_plots

From the analysis we can see that segment 1 spends more than the others on meat and is a high spender in all categories other than wines and gold products. Segment 2 is a generally high spending group except for meats. Segment 3 spends the most on wine by far and has a medium spend in all other categories. Segment 4 is probably your average consumer, spending the least throughout all categories.

Demographics: Age and marital status aren’t particularly notable. Segments 1 and 4 have a similar age profile as do segments 2 and 3. Martial status is similar for all segments.

In terms of targeted marketing, segments 1 and 3 are clearly unique in there spending behavior.

Segment 1 (Meat eaters with money) are very high earners with no kids or teens in the household. They have spare money and they spend it on meat, although they also spend a significant amount in other categories.

Segment 3 (Educated winos) are highly educated, comprised mostly of people with advanced degrees and high earners who love to buy wine.

Segmentation is a extremely valuable tool for marketing. It allows a business to allocate valuable marketing resources efficiently, and provides a clear profile of current and potential customers.