R Markdown

This segmentation analysis is an attempt to find distinct groups for marketing purposes.

Read in the data file

# load data
assignment_data <- read_csv("e:/Portfolio Projects/Segmentation/Data/customer_segmentation.csv")

## Rows: 2240 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Education, Marital_Status, Dt_Customer
## dbl (26): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntF...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

kable(assignment_data[1:5, 1:10], caption = "Examine first few rows")

Examine first few rows
ID	Year_Birth	Education	Marital_Status	Income	Kidhome	Teenhome	Dt_Customer	Recency	MntWines
5524	1957	Graduation	Single	58138	0	0	4/9/2012	58	635
2174	1954	Graduation	Single	46344	1	1	8/3/2014	38	11
4141	1965	Graduation	Together	71613	0	0	21-08-2013	26	426
6182	1984	Graduation	Together	26646	1	0	10/2/2014	26	11
5324	1981	PhD	Married	58293	1	0	19-01-2014	94	173

Explore labels in the data

# examine column labels
names(assignment_data)

##  [1] "ID"                  "Year_Birth"          "Education"          
##  [4] "Marital_Status"      "Income"              "Kidhome"            
##  [7] "Teenhome"            "Dt_Customer"         "Recency"            
## [10] "MntWines"            "MntFruits"           "MntMeatProducts"    
## [13] "MntFishProducts"     "MntSweetProducts"    "MntGoldProds"       
## [16] "NumDealsPurchases"   "NumWebPurchases"     "NumCatalogPurchases"
## [19] "NumStorePurchases"   "NumWebVisitsMonth"   "AcceptedCmp3"       
## [22] "AcceptedCmp4"        "AcceptedCmp5"        "AcceptedCmp1"       
## [25] "AcceptedCmp2"        "Complain"            "Z_CostContact"      
## [28] "Z_Revenue"           "Response"

Are there missing values in the segmentation variables?

# check variables for missing values
missing_seg_vals<- assignment_data %>% map(anyNA)
missings<- names(which(missing_seg_vals == TRUE))
missings

## [1] "Income"            "NumDealsPurchases"

# get a sense of how many rows have missing values
missing_eval<- select(assignment_data, contains(missings))
nrow(missing_eval[which(subset(is.na(missing_eval))==TRUE),])

## [1] 26

Prepare for the segmentation analysis

# select seg vars and store as new df
kmeans_df<- select(assignment_data, starts_with("Mnt")) %>% modify(as.numeric)
colnames(kmeans_df)

## [1] "MntWines"         "MntFruits"        "MntMeatProducts"  "MntFishProducts" 
## [5] "MntSweetProducts" "MntGoldProds"

# # median imputation for missing values # NOT NEEDED THIS TIME #
# kmeans_df_imputed <- kmeans_df %>%
#   mutate(across(everything(), ~ if_else(is.na(.), median(., na.rm = TRUE), .)))
kmeans_df_imputed <- kmeans_df

Segmentation

Scree plot to determine k clusters

# create plot of number of clusters vs total within sum of squares
scree.kmeans<- fviz_nbclust(kmeans_df_imputed, kmeans, method = "wss")
scree.kmeans

K-means

Implement k-means, k = 4

# make this example reproducible
set.seed(1988)

kmeans_obj<- kcca(kmeans_df_imputed, 4, family=kccaFamily("kmeans"), weights=NULL, group=NULL,
     control=NULL, simple=FALSE, save.data=FALSE)

# visualize clusters
prcomp_segs<- prcomp(kmeans_df_imputed)

prcomp_df<- as.data.frame(prcomp_segs$x[,1:2])

ggplot(prcomp_df, aes(x = PC1, y = PC2, color = factor(kmeans_obj@cluster))) +
  labs(color = "Segment") +
  geom_point()

Profiling

Subset the data to profile the demographic variables and Mnt (segmentation) variables separately

# seg_var profiling
assignment_data_profile_seg_var<- cbind(select(assignment_data, starts_with('Mnt')), clusters = kmeans_obj@cluster)
table(assignment_data_profile_seg_var$clusters)

## 
##    1    2    3    4 
##  200  554  277 1209

colnames(assignment_data_profile_seg_var)

## [1] "MntWines"         "MntFruits"        "MntMeatProducts"  "MntFishProducts" 
## [5] "MntSweetProducts" "MntGoldProds"     "clusters"

# d_var profiling

# create factors for Year_Birth, Income, Kidhome, and Teenhome

# create a new age factor variable
# check for entry errors or missing values
min(assignment_data$Year_Birth)

## [1] 1893

max(assignment_data$Year_Birth)

## [1] 1996

# create age variable
assignment_data$age<- 2024 - assignment_data$Year_Birth
min(assignment_data$age)

## [1] 28

max(assignment_data$age)

## [1] 131

# remove Year_Birth values for anyone over 100. Assuming ages over 100+ are entry errors
assignment_data$age<- replace(assignment_data$age, assignment_data$age > 100, NA)
min(assignment_data$age, na.rm = TRUE)

## [1] 28

max(assignment_data$age, na.rm = TRUE)

## [1] 84

breaks <- c(26, 35, 45, 55, 65, 100)
assignment_data <- assignment_data %>%
  mutate(age.factor = cut(age, breaks = breaks, labels = c("1: 26 to 35", "2: 36 to 45", "3: 46 to 55", "4: 56 to 65", "5: Over 65")))
table(assignment_data$age.factor)

## 
## 1: 26 to 35 2: 36 to 45 3: 46 to 55 4: 56 to 65  5: Over 65 
##          91         386         758         486         516

# create a new factor variable based on the ranges
# check for entry errors or missing values
min(assignment_data$Income)

## [1] NA

max(assignment_data$Income)

## [1] NA

# impute missing values with median
assignment_data$Income<- if_else(is.na(assignment_data$Income), median(assignment_data$Income, na.rm = TRUE), assignment_data$Income)
min(assignment_data$Income)

## [1] 1730

max(assignment_data$Income)

## [1] 666666

# 1730 for min and 666666 for max seems fishy but I'll leave them
descr(assignment_data$Income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1730   35539   51382   52238   68290  666666

breaks <- c(0, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, max(assignment_data$Income))
assignment_data <- assignment_data %>%
  mutate(income.factor = cut(Income, breaks = breaks, labels = c("1: $30,000 or less", "2: $30,001 to $40,000", "3: $40,001 to $50,000", "4: $50,001 to $60,000", "5: $60,001 to $70,000", "6: $70,001 to $80,000", "7: $80,001 to $90,000", "8: $90,001 to $100,000", "9: More than $100,000")))
table(assignment_data$income.factor)

## 
##     1: $30,000 or less  2: $30,001 to $40,000  3: $40,001 to $50,000 
##                    370                    362                    328 
##  4: $50,001 to $60,000  5: $60,001 to $70,000  6: $70,001 to $80,000 
##                    339                    333                    291 
##  7: $80,001 to $90,000 8: $90,001 to $100,000  9: More than $100,000 
##                    164                     40                     13

# create Kidhome and Teenhome factor variables
# check for entry errors or missing values
descr(assignment_data$Kidhome)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4442  1.0000  2.0000

descr(assignment_data$Teenhome)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.5062  1.0000  2.0000

assignment_data$kid.factor<- factor(assignment_data$Kidhome)
assignment_data$teen.factor<- factor(assignment_data$Teenhome)

assignment_data_profile_d_var<- cbind(select(assignment_data, starts_with(c("age.factor","Education","Marital_Status","income.factor","kid.factor","teen.factor"))), clusters = kmeans_obj@cluster)
table(assignment_data_profile_d_var$clusters)

## 
##    1    2    3    4 
##  200  554  277 1209

colnames(assignment_data_profile_d_var)

## [1] "age.factor"     "Education"      "Marital_Status" "income.factor" 
## [5] "kid.factor"     "teen.factor"    "clusters"

# append the clusters to kmeans df
kmeans_df_imputed$clusters<- assignment_data_profile_d_var$clusters
colnames(kmeans_df_imputed)

## [1] "MntWines"         "MntFruits"        "MntMeatProducts"  "MntFishProducts" 
## [5] "MntSweetProducts" "MntGoldProds"     "clusters"

# explore attribute importance for the clusters across the seg vars
set.seed(1988)
rand_frst <- randomForest(clusters ~ ., data = kmeans_df_imputed, ntree = 1000,
                          importance = TRUE, proximity = TRUE)

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

varImpPlot(rand_frst, sort = TRUE, main = "Variable Importance Plot")

Frequencies for seg vars

profile_seg_var<- assignment_data_profile_seg_var

seg_var_freqs_list<- list()

# loop through each column in the dataframe
for (col_name in colnames(profile_seg_var)) {
  # group by clusters count each d_ variable
  d_var_result <- profile_seg_var %>%
    group_by(clusters) %>%
    count(!!sym(col_name)) %>%
    mutate(pct = n / sum(n) * 100) %>%
    as.data.frame()

  # append the result to the list
  seg_var_freqs_list[[col_name]] <- d_var_result
}

# rename all seg_vars to plot easily, d_vars used in plotting function
seg_var_freqs_list <- map(seg_var_freqs_list, ~ .x %>%
                    rename(d_vars = names(.x)[2]))

# drop some d_elements that aren't suited for plotting
seg_var_freqs_list<- seg_var_freqs_list[names(seg_var_freqs_list) %in% "clusters" == FALSE]

# # verify removal
names(seg_var_freqs_list)

## [1] "MntWines"         "MntFruits"        "MntMeatProducts"  "MntFishProducts" 
## [5] "MntSweetProducts" "MntGoldProds"

Prepare for profiling visualizations

d_var_freqs_list<- list()

# loop through each column in the dataframe
for (col_name in colnames(assignment_data_profile_d_var)) {
  # group by clusters count each d_ variable
  d_var_result <- assignment_data_profile_d_var %>%
    group_by(clusters) %>%
    count(!!sym(col_name)) %>%
    mutate(pct = n / sum(n) * 100) %>%
    as.data.frame()

  # append the result to the list
  d_var_freqs_list[[col_name]] <- d_var_result
}

# rename all d_vars to plot easily
d_var_freqs_list <- map(d_var_freqs_list, ~ .x %>%
                    rename(d_vars = names(.x)[2]))

# drop some d_elements that aren't suited for plotting
d_var_freqs_list_profiling<- d_var_freqs_list[names(d_var_freqs_list) %in% "clusters" == FALSE]

# verify removal
names(d_var_freqs_list_profiling)

## [1] "age.factor"     "Education"      "Marital_Status" "income.factor" 
## [5] "kid.factor"     "teen.factor"

Visualizations for profiling

Profiling with segmentation input variables to validate internal consistency

# create a function to generate a bar chart

create_bar_chart <- function(df) {ggplot(df, aes(x = factor(clusters), y = d_vars, fill = factor(clusters))) +
  geom_bar(stat = "identity") +
  labs(x = "Segment", y = "Spend", fill = "Segment") +
  theme_minimal()}

# create a list of plots for each dataframe
seg_var_plots_list <- map(seg_var_freqs_list, create_bar_chart)

seg_var_plots_labs <- names(seg_var_plots_list)
seg_var_plots<- map2(seg_var_plots_list, seg_var_plots_labs, ~ .x + labs(title = .y))

# view grids
seg_var_plots <- marrangeGrob(seg_var_plots, nrow=1, ncol=2)
seg_var_plots

Demographic profiling across segments

# function for demographic profiling plots
create_demo_bar_chart <- function(df) {
  ggplot(df, aes(x = clusters, y = pct, fill = d_vars)) +
    geom_bar(stat = "identity") +
    labs(title = "", x = "Segment", y = "Percentage") +
    guides(fill = guide_legend(title = "Levels"))
}

# create a list of plots for each dataframe
d_var_plots_list <- map(d_var_freqs_list_profiling, create_demo_bar_chart)

# set titles
d_var_plots<- map2(d_var_plots_list, names(d_var_freqs_list_profiling), ~ .x + labs(title = .y))

# view grids
d_var_plots <- marrangeGrob(d_var_plots, nrow=1, ncol=2)
d_var_plots

From the analysis we can see that segment 1 spends more than the others on meat and is a high spender in all categories other than wines and gold products. Segment 2 is a generally high spending group except for meats. Segment 3 spends the most on wine by far and has a medium spend in all other categories. Segment 4 is probably your average consumer, spending the least throughout all categories.

Demographics: Age and marital status aren’t particularly notable. Segments 1 and 4 have a similar age profile as do segments 2 and 3. Martial status is similar for all segments.

In terms of targeted marketing, segments 1 and 3 are clearly unique in there spending behavior.

Segment 1 (Meat eaters with money) are very high earners with no kids or teens in the household. They have spare money and they spend it on meat, although they also spend a significant amount in other categories.

Segment 3 (Educated winos) are highly educated, comprised mostly of people with advanced degrees and high earners who love to buy wine.

Segmentation is a extremely valuable tool for marketing. It allows a business to allocate valuable marketing resources efficiently, and provides a clear profile of current and potential customers.

Segmentation analysis for food and drink purchasing behavior