Load library packages

library(tidyverse)  # Multiple packages for data analysis

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(visdat)     # visualize missing values
library(mice)       # multiple imputation of missing values

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(knitr)      # for tidy table display
library(kableExtra)  # for html table display

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(GGally)     # for scatter plot matrix

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(ggmosaic)   # mosaic plot

## 
## Attaching package: 'ggmosaic'

## The following object is masked from 'package:GGally':
## 
##     happy

library(reshape2)   # data wrangling

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

library(ggforce)    # for parallet set plots

Data acquisition

Load the data into R workspace. The data can be downloaded from Kaggle.

# Set the working directory to where the data file is located
setwd("~/ml projects/customer segmentation")
cust_data = read.csv("marketing_campaign.csv", sep = "\t")

There are 29 columns (variables) and 2240 rows (observations). Each observation corresponds to a customer.

##  [1] "ID"                  "Year_Birth"          "Education"          
##  [4] "Marital_Status"      "Income"              "Kidhome"            
##  [7] "Teenhome"            "Dt_Customer"         "Recency"            
## [10] "MntWines"            "MntFruits"           "MntMeatProducts"    
## [13] "MntFishProducts"     "MntSweetProducts"    "MntGoldProds"       
## [16] "NumDealsPurchases"   "NumWebPurchases"     "NumCatalogPurchases"
## [19] "NumStorePurchases"   "NumWebVisitsMonth"   "AcceptedCmp3"       
## [22] "AcceptedCmp4"        "AcceptedCmp5"        "AcceptedCmp1"       
## [25] "AcceptedCmp2"        "Complain"            "Z_CostContact"      
## [28] "Z_Revenue"           "Response"

## 'data.frame':    2240 obs. of  29 variables:
##  $ ID                 : int  5524 2174 4141 6182 5324 7446 965 6177 4855 5899 ...
##  $ Year_Birth         : int  1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 ...
##  $ Education          : chr  "Graduation" "Graduation" "Graduation" "Graduation" ...
##  $ Marital_Status     : chr  "Single" "Single" "Together" "Together" ...
##  $ Income             : int  58138 46344 71613 26646 58293 62513 55635 33454 30351 5648 ...
##  $ Kidhome            : int  0 1 0 1 1 0 0 1 1 1 ...
##  $ Teenhome           : int  0 1 0 0 0 1 1 0 0 1 ...
##  $ Dt_Customer        : chr  "04-09-2012" "08-03-2014" "21-08-2013" "10-02-2014" ...
##  $ Recency            : int  58 38 26 26 94 16 34 32 19 68 ...
##  $ MntWines           : int  635 11 426 11 173 520 235 76 14 28 ...
##  $ MntFruits          : int  88 1 49 4 43 42 65 10 0 0 ...
##  $ MntMeatProducts    : int  546 6 127 20 118 98 164 56 24 6 ...
##  $ MntFishProducts    : int  172 2 111 10 46 0 50 3 3 1 ...
##  $ MntSweetProducts   : int  88 1 21 3 27 42 49 1 3 1 ...
##  $ MntGoldProds       : int  88 6 42 5 15 14 27 23 2 13 ...
##  $ NumDealsPurchases  : int  3 2 1 2 5 2 4 2 1 1 ...
##  $ NumWebPurchases    : int  8 1 8 2 5 6 7 4 3 1 ...
##  $ NumCatalogPurchases: int  10 1 2 0 3 4 3 0 0 0 ...
##  $ NumStorePurchases  : int  4 2 10 4 6 10 7 4 2 0 ...
##  $ NumWebVisitsMonth  : int  7 5 4 6 5 6 6 8 9 20 ...
##  $ AcceptedCmp3       : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ AcceptedCmp4       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp5       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp1       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp2       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Complain           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Z_CostContact      : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Z_Revenue          : int  11 11 11 11 11 11 11 11 11 11 ...
##  $ Response           : int  1 0 0 0 0 0 0 0 1 0 ...

##        ID          Year_Birth    Education         Marital_Status    
##  Min.   :    0   Min.   :1893   Length:2240        Length:2240       
##  1st Qu.: 2828   1st Qu.:1959   Class :character   Class :character  
##  Median : 5458   Median :1970   Mode  :character   Mode  :character  
##  Mean   : 5592   Mean   :1969                                        
##  3rd Qu.: 8428   3rd Qu.:1977                                        
##  Max.   :11191   Max.   :1996                                        
##                                                                      
##      Income          Kidhome          Teenhome      Dt_Customer       
##  Min.   :  1730   Min.   :0.0000   Min.   :0.0000   Length:2240       
##  1st Qu.: 35303   1st Qu.:0.0000   1st Qu.:0.0000   Class :character  
##  Median : 51382   Median :0.0000   Median :0.0000   Mode  :character  
##  Mean   : 52247   Mean   :0.4442   Mean   :0.5062                     
##  3rd Qu.: 68522   3rd Qu.:1.0000   3rd Qu.:1.0000                     
##  Max.   :666666   Max.   :2.0000   Max.   :2.0000                     
##  NA's   :24                                                           
##     Recency         MntWines         MntFruits     MntMeatProducts 
##  Min.   : 0.00   Min.   :   0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.:24.00   1st Qu.:  23.75   1st Qu.:  1.0   1st Qu.:  16.0  
##  Median :49.00   Median : 173.50   Median :  8.0   Median :  67.0  
##  Mean   :49.11   Mean   : 303.94   Mean   : 26.3   Mean   : 166.9  
##  3rd Qu.:74.00   3rd Qu.: 504.25   3rd Qu.: 33.0   3rd Qu.: 232.0  
##  Max.   :99.00   Max.   :1493.00   Max.   :199.0   Max.   :1725.0  
##                                                                    
##  MntFishProducts  MntSweetProducts  MntGoldProds    NumDealsPurchases
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   : 0.000   
##  1st Qu.:  3.00   1st Qu.:  1.00   1st Qu.:  9.00   1st Qu.: 1.000   
##  Median : 12.00   Median :  8.00   Median : 24.00   Median : 2.000   
##  Mean   : 37.53   Mean   : 27.06   Mean   : 44.02   Mean   : 2.325   
##  3rd Qu.: 50.00   3rd Qu.: 33.00   3rd Qu.: 56.00   3rd Qu.: 3.000   
##  Max.   :259.00   Max.   :263.00   Max.   :362.00   Max.   :15.000   
##                                                                      
##  NumWebPurchases  NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
##  Min.   : 0.000   Min.   : 0.000      Min.   : 0.00     Min.   : 0.000   
##  1st Qu.: 2.000   1st Qu.: 0.000      1st Qu.: 3.00     1st Qu.: 3.000   
##  Median : 4.000   Median : 2.000      Median : 5.00     Median : 6.000   
##  Mean   : 4.085   Mean   : 2.662      Mean   : 5.79     Mean   : 5.317   
##  3rd Qu.: 6.000   3rd Qu.: 4.000      3rd Qu.: 8.00     3rd Qu.: 7.000   
##  Max.   :27.000   Max.   :28.000      Max.   :13.00     Max.   :20.000   
##                                                                          
##   AcceptedCmp3      AcceptedCmp4      AcceptedCmp5      AcceptedCmp1    
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.07277   Mean   :0.07455   Mean   :0.07277   Mean   :0.06429  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##                                                                         
##   AcceptedCmp2        Complain        Z_CostContact   Z_Revenue 
##  Min.   :0.00000   Min.   :0.000000   Min.   :3     Min.   :11  
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:3     1st Qu.:11  
##  Median :0.00000   Median :0.000000   Median :3     Median :11  
##  Mean   :0.01339   Mean   :0.009375   Mean   :3     Mean   :11  
##  3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:3     3rd Qu.:11  
##  Max.   :1.00000   Max.   :1.000000   Max.   :3     Max.   :11  
##                                                                 
##     Response     
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1491  
##  3rd Qu.:0.0000  
##  Max.   :1.0000  
##

From the output of summary() function, it is noted that there are missing values for the Income attribute. Aside from that, there are some other things to keep in mind: 1. Feature like ID is probably not useful as it does not encode any information about a customer. I will keep it in the DataFrame variable but not using it for any analysis. 2. Feature like Z_CostContact and Z_Revenue are not useful attributes as they have zero variance.

## variance of  Z_CostContact  is:  0

## variance of  Z_Revenue  is:  0

Some feature engineering is required for some other features like Dt_Customer and Year_Birth.
Feature restructuring from character to factor for features such as Education and Marital Status.

Feature engineering

Removal of zero variance variables

cust_data_copy = cust_data
cust_data = subset(cust_data, select = -c(Z_CostContact, Z_Revenue))

Get rid of invalid observations I

table(cust_data$Marital_Status)

## 
##   Absurd    Alone Divorced  Married   Single Together    Widow     YOLO 
##        2        3      232      864      480      580       77        2

knitr::kable(cust_data %>% filter(Marital_Status == "YOLO"), "html", caption = "YOLO instances") %>%  kable_styling("striped") %>% scroll_box(width = "100%")

YOLO instances
ID	Year_Birth	Education	Marital_Status	Income	Kidhome	Teenhome	Dt_Customer	Recency	MntWines	MntFruits	MntMeatProducts	MntFishProducts	MntSweetProducts	MntGoldProds	NumDealsPurchases	NumWebPurchases	NumCatalogPurchases	NumStorePurchases	NumWebVisitsMonth	AcceptedCmp3	AcceptedCmp4	AcceptedCmp5	AcceptedCmp1	AcceptedCmp2	Complain	Response
492	1973	PhD	YOLO	48432	0	1	18-10-2012	3	322	3	50	4	3	42	5	7	1	6	8	0	0	0	0	0	0	0
11133	1973	PhD	YOLO	48432	0	1	18-10-2012	3	322	3	50	4	3	42	5	7	1	6	8	0	0	0	0	0	0	1

cust_data = cust_data %>% filter(Marital_Status != "YOLO")

Feature extraction I

Get the minimum number of household members from attributes Marital_Status, Kidhome and Teenhome.
Get the total number of accepted offers.

# Calculate total number of accepted campaign, minimum number of household members.

min_num_member = function(x) {
  if (x[1]=="Married" || x[1]=="Together"){
    np = 2
  } else {
    np = 1
  }
  return(as.numeric(x[2]) + as.numeric(x[3]) + np)
}
min_num_household = apply(cust_data %>% 
                            select(Marital_Status, Kidhome, Teenhome), 
          1, min_num_member)
cust_data$min_num_household = min_num_household

cust_data = cust_data %>% mutate(tot_AcceptedCmp = AcceptedCmp1 + AcceptedCmp2 + AcceptedCmp3 + AcceptedCmp4 + AcceptedCmp5 + Response)

Restructuring of data types

# Change the date from string to date format
cust_data$Dt_Customer = as.Date(cust_data$Dt_Customer, 
                                     format = c("%d-%m-%Y"))
# cust_data %>% as_tibble() %>% glimpse()
# Change the categorical features from character to factor
categorical_features = c("Education", "Marital_Status", "AcceptedCmp1",
                         "AcceptedCmp2", "AcceptedCmp3", "AcceptedCmp4",
                         "AcceptedCmp5", "Complain", "Response")
for (i in 1:length(categorical_features)){
  cust_data[[categorical_features[i]]] = 
    as.factor(cust_data[[categorical_features[i]]])
}
# ordered categorical variables: Kidhome and Teenhome
cust_data$Kidhome = factor(cust_data$Kidhome, 
                           levels = seq(from = min(cust_data$Kidhome), 
                                        to = max(cust_data$Kidhome), 
                                        by = 1), ordered = TRUE)
cust_data$Teenhome = factor(cust_data$Teenhome, 
                           levels = seq(from = min(cust_data$Teenhome), 
                                        to = max(cust_data$Teenhome), 
                                        by = 1), ordered = TRUE)

Feature extraction II

Change the attribute Year_Birth to age and change the Dt_Customer to number of days a customer is enrolled with the company.

# Change Year_Birth to age
current_year = 2014
cust_data = cust_data %>% mutate(age = current_year - Year_Birth) %>% 
  subset(select = -c(Year_Birth))

# Change Dt_customer to days of enrollment. Assume that the current date is 1st July 2014
current_date = as.Date("2014-07-01")
cust_data = cust_data %>% 
  mutate(days_enroll = difftime(current_date, Dt_Customer, units = c("days")) %>% 
           as.numeric()) %>% 
  subset(select = -c(Dt_Customer))

Get rid of outliers

summary(cust_data$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    18.0    37.0    44.0    45.2    55.0   121.0

cust_data = cust_data %>% filter(age < 100)

cust_data %>% arrange(desc(Income)) %>% head(3) %>% kable("html", caption = "Income in descending order") %>% kable_styling("striped") %>% scroll_box(width = "100%")

Income in descending order
ID	Education	Marital_Status	Income	Kidhome	Teenhome	Recency	MntWines	MntFruits	MntMeatProducts	MntFishProducts	MntSweetProducts	MntGoldProds	NumDealsPurchases	NumWebPurchases	NumCatalogPurchases	NumStorePurchases	NumWebVisitsMonth	min_num_household	age	days_enroll
9432	Graduation	Together	666666	1	0	23	9	14	18	8	1	12	4	3	1	3	6	3	37	394
1503	PhD	Together	162397	1	1	31	85	1	16	2	1	2	0	0	0	1	1	4	38	393
1501	PhD	Married	160803	0	0	21	55	16	1622	17	3	4	15	0	28	1	0	2	32	696

cust_data = cust_data %>% filter(ID!=9432)

Deal with missing values

There are 3 types of missing values:
1. missing completely at random (MCAR) 2. missing at random (MAR) 3. missing not at random (MNAR). Refer to this awesome article for more info on each types of missing values. There are several strategies to choose from when we encounter missing values in the data:
1. Complete case analysis (ignore observation(s) with missing values).
2. Single imputation - mean, median, regression model.
3. Multiple imputation - leverage distribution of observed data to estimate the missing values.
Multiple imputation is generally superior to single imputation because the missing values are calculated multiple times with many different plausible values being pooled together to get the estimates. For this experiment, Multiple Imputation by Chained Equations (MICE) is implemented to obtain the estimation of missing Income instances.

# Write a utility function to check for missing values.
check_missing_values = function(cust_data){
  vec_missing = apply(cust_data, 2, function(x){ sum(any(is.na(x)))})
  idx_missing = which(as.vector(vec_missing)==1)
  if (length(idx_missing)>0){
    cat("The columns(features) with missing values: ", 
      colnames(cust_data)[idx_missing])
    
  } else {
    print("No missing values")
  }
  
}
check_missing_values(cust_data)

## The columns(features) with missing values:  Income

vis_miss(cust_data)

cat(" \nThe number of missing values for income: ", sum(is.na(cust_data$Income)))

##  
## The number of missing values for income:  24

idx_missing = which(is.na(cust_data$Income))

Perform multiple imputation.

# MICE implementation
imp = mice(cust_data, maxit = 0, print = F)
pred = imp$predictorMatrix
pred[,"ID"] = 0
# Use the default value of 5 iterations
imp = mice(cust_data, seed = 10, predictorMatrix = pred, print = F)
stripplot(imp, Income~.imp, pch = 20, cex = 2)

cust_data = complete(imp, 5)
# Save the preprocessed data for further analysis
saveRDS(cust_data, file = "preprocessed_cust_data.rds")

This wraps up the preprocessing of the data. Lets take a look at the results of missing Income imputation

Missing values imputed customer data
	ID	Education	Marital_Status	Income
11	1994	Graduation	Married	8028
28	5255	Graduation	Single	4023
44	7281	PhD	Single	8028
49	7244	Graduation	Single	4023
59	8557	Graduation	Single	5648

Data visualization

Data visualization is graphical representation of information and data.

Histogram and density plot

Lets visualize the household income. There is/are customer(s) with unusually high yearly household income based on empirical observation on the scale of x axis of the histogram plot.

#hist_data = cust_data %>% ggplot(aes(x = Income)) + 
#  geom_histogram(binwidth = 10000, boundary = 0)

cust_data %>% ggplot(aes(x = Income)) + 
  geom_histogram(aes(y = stat(count)/sum(count)), binwidth = 10000, 
                 boundary = 0) +
  ylab("Relative frequency")

# density plot
cust_data %>% ggplot(aes(x = Income)) + geom_density(fill = "lightblue", alpha = 0.5) +
  geom_vline(aes(xintercept=mean(Income)), color = "blue", linetype = 2) + geom_vline(aes(xintercept=median(Income)), color = "red", linetype = 3) + xlim(0, 2e+05) + theme_classic() + ggtitle("density plot for customer annual household income")

cust_data %>% ggplot(aes(x = Income)) + 
  geom_histogram(aes(y = stat(count)/sum(count)), binwidth = 0.2, 
                 boundary = 0) +
  scale_x_log10() +
  ylab("Relative frequency") +
  ggtitle("Distribution of income in log10 scales")

Most of the customers (99.5 %) have income of equal or less than 100k. In fact, (90.91 %) of the customers have annual household income within 20k-90k.

Scatter plots among continuous predictors

# Linear correlation and scatter plots matrix for amount spent on different merchandise
idx_cont_features = grepl("Mnt", colnames(cust_data))
cust_data[, idx_cont_features] %>% ggpairs(title = "correlogram")

# Linear correlation and scatter plots matrix for number of purchases.
idx_cont_features = grepl("Num", colnames(cust_data))
cust_data[, idx_cont_features] %>% ggpairs(title = "correlogram")

Correlation matrix / Correlogram

Linear correlation between numerical features.

seq = c("Mnt", "Num","Age", "days", "Income", "Recency", "household","tot")
a = sapply(seq, function(x) grep(x, colnames(cust_data)))

idx = c()
for (i in (1:length(seq))){
  idx = c(idx, a[[i]])
}

corr_mat = cor(cust_data[,idx])
corr_mat[upper.tri(corr_mat)] = NA
melted_cormat = melt(corr_mat, na.rm = TRUE)
# The dimension becomes nx3 matrix

melted_cormat %>% ggplot(aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") + 
  scale_fill_gradientn(limit = c(-1,1), colors = hcl.colors(40, "Earth")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)) + coord_fixed() +
  geom_text(aes(Var1, Var2, label = round(value, 2)), color = "black", size = 1.5)

Mosaic Plot

Response rate from different groups of population.

cust_data %>% ggplot() +
  geom_mosaic(aes(x = product(Marital_Status, Education), fill = Response)) +
  theme(axis.text.x = element_text(face = "bold", angle = 90),
        axis.text.y = element_text(face = "bold")) +
  ggtitle("Mosaic Plot of Response, Marital status and Education")

cust_data %>% ggplot() +
  geom_mosaic(aes(x = product(Kidhome, Teenhome), fill = Response)) +
  ggtitle("Mosaic Plot of Response, Kidhome and Teenhome")

Parallel set plots

data_group = cust_data %>% group_by(Education, Marital_Status, Kidhome, 
                                    Teenhome, Response) %>% 
  summarise(summ = n())

## `summarise()` has grouped output by 'Education', 'Marital_Status', 'Kidhome',
## 'Teenhome'. You can override using the `.groups` argument.

data_parallel = reshape2::melt(data_group)

## Using Education, Marital_Status, Kidhome, Teenhome, Response as id variables

data_parallel = gather_set_data(data_parallel, 1:5)
ggplot(data_parallel, aes(x, id = id, split = y, value = value)) +
  geom_parallel_sets(aes(fill = Response), alpha = 0.3, axis.width = 0.1) +
  geom_parallel_sets_axes(axis.width = 0.1) +
  geom_parallel_sets_labels(colour = 'white')

There is virtually no chance for the customers with 2 children or 2 teenagers in their household to accept the offer.

Grouped box plots / violin plots

Based on minimum number of household members, marital status and education.

Visualize income

Distribution of income for different categories of customers.

seq = c("Income", "Education", "Marital", "min")
idx_plot = sapply(seq, function(x) grep(x, colnames(cust_data)))

idx = c()
for (i in (1:length(seq))){
  idx = c(idx, idx_plot[[i]])
}
cust_data %>% dplyr::select(idx) %>% 
  ggplot(aes(x = Education, y = Income, fill = Education)) +
  geom_violin(scale = "count", draw_quantiles = c(0.25, 0.5, 0.75)) + stat_summary(fun = mean, geom = "point", shape=21, size=1.5, color = "red") + ylim(0, 2e+05)

## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(idx)` instead of `idx` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

cust_data %>% dplyr::select(idx) %>% dplyr::filter(!(Marital_Status %in% c("Alone", "Absurd"))) %>% 
  ggplot(aes(x = Marital_Status, y = Income, fill = Marital_Status)) +
  geom_violin(scale = "count", draw_quantiles = c(0.25, 0.5, 0.75)) + stat_summary(fun = mean, geom = "point", shape=21, size=1.5, color = "red") + ylim(0, 2e+05) + ggtitle("Distribution of household income based on marital status")

cust_data %>% dplyr::select(idx) %>% mutate(num_household = as.factor(min_num_household)) %>%
  ggplot(aes(x = num_household, y = Income, fill = num_household)) +
  geom_violin(scale = "count", draw_quantiles = c(0.25, 0.5, 0.75)) + stat_summary(fun = mean, geom = "point", shape=21, size=1.5, color = "red") + ylim(0, 2e+05) + labs(title = "Distribution of household income based on minimum number of household members", x = "minimum number of household members")

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

Visualize expenditure

Show expenditure of different types of products.

seq = c("Mnt", "Education", "Marital", "min")
idx_plot = sapply(seq, function(x) grep(x, colnames(cust_data)))

idx = c()
for (i in (1:length(seq))){
  idx = c(idx, idx_plot[[i]])
}
cust_data %>% dplyr::select(idx) %>% 
  pivot_longer(starts_with("Mnt"), names_to = "purchase_types",
               values_to = "amount_spent") %>% 
  ggplot(aes(x = Education, y = amount_spent)) +
  geom_boxplot(fill = "chocolate2", notch = TRUE) + stat_summary(fun = mean, geom = "point", shape=21, size=1.5, color = "red") + theme_classic() + facet_wrap(~purchase_types, ncol = 2, scales = "free_y")

cust_data %>% dplyr::filter(!(Marital_Status %in% c("Alone", "Absurd"))) %>% dplyr::select(idx) %>% 
  pivot_longer(starts_with("Mnt"), names_to = "purchase_types",
               values_to = "amount_spent") %>% 
  ggplot(aes(x = Marital_Status, y = amount_spent)) +
  geom_boxplot(fill = "chocolate2", notch = TRUE) + stat_summary(fun = mean, geom = "point", shape=21, size=1.5, color = "red") + theme_classic() + facet_wrap(~purchase_types, ncol = 2, scales = "free_y")

## notch went outside hinges. Try setting notch=FALSE.

cust_data %>% dplyr::select(idx) %>% mutate(num_household = as.factor(min_num_household)) %>% 
  pivot_longer(starts_with("Mnt"), names_to = "purchase_types",
               values_to = "amount_spent") %>% 
  ggplot(aes(x = num_household, y = amount_spent)) +
  geom_boxplot(fill = "chocolate2", notch = TRUE) + stat_summary(fun = mean, geom = "point", shape=21, size=1.5, color = "red") + theme_classic() + facet_wrap(~purchase_types, ncol = 2, scales = "free_y")

## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

##                _                           
## platform       x86_64-w64-mingw32          
## arch           x86_64                      
## os             mingw32                     
## system         x86_64, mingw32             
## status                                     
## major          4                           
## minor          1.2                         
## year           2021                        
## month          11                          
## day            01                          
## svn rev        81115                       
## language       R                           
## version.string R version 4.1.2 (2021-11-01)
## nickname       Bird Hippie

Customer personality analysis: Data visualization

Lim Jia Qi

5/24/2022