Load library packages
library(tidyverse) # Multiple packages for data analysis## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(visdat) # visualize missing values
library(mice) # multiple imputation of missing values##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(knitr) # for tidy table display
library(kableExtra) # for html table display##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(GGally) # for scatter plot matrix## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggmosaic) # mosaic plot##
## Attaching package: 'ggmosaic'
## The following object is masked from 'package:GGally':
##
## happy
library(reshape2) # data wrangling##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggforce) # for parallet set plotsData acquisition
Load the data into R workspace. The data can be downloaded from Kaggle.
# Set the working directory to where the data file is located
setwd("~/ml projects/customer segmentation")
cust_data = read.csv("marketing_campaign.csv", sep = "\t")There are 29 columns (variables) and 2240 rows (observations). Each observation corresponds to a customer.
## [1] "ID" "Year_Birth" "Education"
## [4] "Marital_Status" "Income" "Kidhome"
## [7] "Teenhome" "Dt_Customer" "Recency"
## [10] "MntWines" "MntFruits" "MntMeatProducts"
## [13] "MntFishProducts" "MntSweetProducts" "MntGoldProds"
## [16] "NumDealsPurchases" "NumWebPurchases" "NumCatalogPurchases"
## [19] "NumStorePurchases" "NumWebVisitsMonth" "AcceptedCmp3"
## [22] "AcceptedCmp4" "AcceptedCmp5" "AcceptedCmp1"
## [25] "AcceptedCmp2" "Complain" "Z_CostContact"
## [28] "Z_Revenue" "Response"
## 'data.frame': 2240 obs. of 29 variables:
## $ ID : int 5524 2174 4141 6182 5324 7446 965 6177 4855 5899 ...
## $ Year_Birth : int 1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 ...
## $ Education : chr "Graduation" "Graduation" "Graduation" "Graduation" ...
## $ Marital_Status : chr "Single" "Single" "Together" "Together" ...
## $ Income : int 58138 46344 71613 26646 58293 62513 55635 33454 30351 5648 ...
## $ Kidhome : int 0 1 0 1 1 0 0 1 1 1 ...
## $ Teenhome : int 0 1 0 0 0 1 1 0 0 1 ...
## $ Dt_Customer : chr "04-09-2012" "08-03-2014" "21-08-2013" "10-02-2014" ...
## $ Recency : int 58 38 26 26 94 16 34 32 19 68 ...
## $ MntWines : int 635 11 426 11 173 520 235 76 14 28 ...
## $ MntFruits : int 88 1 49 4 43 42 65 10 0 0 ...
## $ MntMeatProducts : int 546 6 127 20 118 98 164 56 24 6 ...
## $ MntFishProducts : int 172 2 111 10 46 0 50 3 3 1 ...
## $ MntSweetProducts : int 88 1 21 3 27 42 49 1 3 1 ...
## $ MntGoldProds : int 88 6 42 5 15 14 27 23 2 13 ...
## $ NumDealsPurchases : int 3 2 1 2 5 2 4 2 1 1 ...
## $ NumWebPurchases : int 8 1 8 2 5 6 7 4 3 1 ...
## $ NumCatalogPurchases: int 10 1 2 0 3 4 3 0 0 0 ...
## $ NumStorePurchases : int 4 2 10 4 6 10 7 4 2 0 ...
## $ NumWebVisitsMonth : int 7 5 4 6 5 6 6 8 9 20 ...
## $ AcceptedCmp3 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ AcceptedCmp4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Complain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Z_CostContact : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Z_Revenue : int 11 11 11 11 11 11 11 11 11 11 ...
## $ Response : int 1 0 0 0 0 0 0 0 1 0 ...
## ID Year_Birth Education Marital_Status
## Min. : 0 Min. :1893 Length:2240 Length:2240
## 1st Qu.: 2828 1st Qu.:1959 Class :character Class :character
## Median : 5458 Median :1970 Mode :character Mode :character
## Mean : 5592 Mean :1969
## 3rd Qu.: 8428 3rd Qu.:1977
## Max. :11191 Max. :1996
##
## Income Kidhome Teenhome Dt_Customer
## Min. : 1730 Min. :0.0000 Min. :0.0000 Length:2240
## 1st Qu.: 35303 1st Qu.:0.0000 1st Qu.:0.0000 Class :character
## Median : 51382 Median :0.0000 Median :0.0000 Mode :character
## Mean : 52247 Mean :0.4442 Mean :0.5062
## 3rd Qu.: 68522 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :666666 Max. :2.0000 Max. :2.0000
## NA's :24
## Recency MntWines MntFruits MntMeatProducts
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.:24.00 1st Qu.: 23.75 1st Qu.: 1.0 1st Qu.: 16.0
## Median :49.00 Median : 173.50 Median : 8.0 Median : 67.0
## Mean :49.11 Mean : 303.94 Mean : 26.3 Mean : 166.9
## 3rd Qu.:74.00 3rd Qu.: 504.25 3rd Qu.: 33.0 3rd Qu.: 232.0
## Max. :99.00 Max. :1493.00 Max. :199.0 Max. :1725.0
##
## MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 3.00 1st Qu.: 1.00 1st Qu.: 9.00 1st Qu.: 1.000
## Median : 12.00 Median : 8.00 Median : 24.00 Median : 2.000
## Mean : 37.53 Mean : 27.06 Mean : 44.02 Mean : 2.325
## 3rd Qu.: 50.00 3rd Qu.: 33.00 3rd Qu.: 56.00 3rd Qu.: 3.000
## Max. :259.00 Max. :263.00 Max. :362.00 Max. :15.000
##
## NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 3.00 1st Qu.: 3.000
## Median : 4.000 Median : 2.000 Median : 5.00 Median : 6.000
## Mean : 4.085 Mean : 2.662 Mean : 5.79 Mean : 5.317
## 3rd Qu.: 6.000 3rd Qu.: 4.000 3rd Qu.: 8.00 3rd Qu.: 7.000
## Max. :27.000 Max. :28.000 Max. :13.00 Max. :20.000
##
## AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.07277 Mean :0.07455 Mean :0.07277 Mean :0.06429
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
##
## AcceptedCmp2 Complain Z_CostContact Z_Revenue
## Min. :0.00000 Min. :0.000000 Min. :3 Min. :11
## 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:3 1st Qu.:11
## Median :0.00000 Median :0.000000 Median :3 Median :11
## Mean :0.01339 Mean :0.009375 Mean :3 Mean :11
## 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:3 3rd Qu.:11
## Max. :1.00000 Max. :1.000000 Max. :3 Max. :11
##
## Response
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1491
## 3rd Qu.:0.0000
## Max. :1.0000
##
From the output of summary() function, it is noted that there are missing values for the Income attribute. Aside from that, there are some other things to keep in mind: 1. Feature like ID is probably not useful as it does not encode any information about a customer. I will keep it in the DataFrame variable but not using it for any analysis. 2. Feature like Z_CostContact and Z_Revenue are not useful attributes as they have zero variance.
## variance of Z_CostContact is: 0
## variance of Z_Revenue is: 0
- Some feature engineering is required for some other features like
Dt_CustomerandYear_Birth. - Feature restructuring from
charactertofactorfor features such asEducationandMarital Status.
Feature engineering
Removal of zero variance variables
cust_data_copy = cust_data
cust_data = subset(cust_data, select = -c(Z_CostContact, Z_Revenue))Get rid of invalid observations I
table(cust_data$Marital_Status)##
## Absurd Alone Divorced Married Single Together Widow YOLO
## 2 3 232 864 480 580 77 2
knitr::kable(cust_data %>% filter(Marital_Status == "YOLO"), "html", caption = "YOLO instances") %>% kable_styling("striped") %>% scroll_box(width = "100%")| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Response |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 492 | 1973 | PhD | YOLO | 48432 | 0 | 1 | 18-10-2012 | 3 | 322 | 3 | 50 | 4 | 3 | 42 | 5 | 7 | 1 | 6 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 11133 | 1973 | PhD | YOLO | 48432 | 0 | 1 | 18-10-2012 | 3 | 322 | 3 | 50 | 4 | 3 | 42 | 5 | 7 | 1 | 6 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
cust_data = cust_data %>% filter(Marital_Status != "YOLO")Feature extraction I
- Get the minimum number of household members from attributes
Marital_Status,KidhomeandTeenhome. - Get the total number of accepted offers.
# Calculate total number of accepted campaign, minimum number of household members.
min_num_member = function(x) {
if (x[1]=="Married" || x[1]=="Together"){
np = 2
} else {
np = 1
}
return(as.numeric(x[2]) + as.numeric(x[3]) + np)
}
min_num_household = apply(cust_data %>%
select(Marital_Status, Kidhome, Teenhome),
1, min_num_member)
cust_data$min_num_household = min_num_household
cust_data = cust_data %>% mutate(tot_AcceptedCmp = AcceptedCmp1 + AcceptedCmp2 + AcceptedCmp3 + AcceptedCmp4 + AcceptedCmp5 + Response)Restructuring of data types
# Change the date from string to date format
cust_data$Dt_Customer = as.Date(cust_data$Dt_Customer,
format = c("%d-%m-%Y"))
# cust_data %>% as_tibble() %>% glimpse()
# Change the categorical features from character to factor
categorical_features = c("Education", "Marital_Status", "AcceptedCmp1",
"AcceptedCmp2", "AcceptedCmp3", "AcceptedCmp4",
"AcceptedCmp5", "Complain", "Response")
for (i in 1:length(categorical_features)){
cust_data[[categorical_features[i]]] =
as.factor(cust_data[[categorical_features[i]]])
}
# ordered categorical variables: Kidhome and Teenhome
cust_data$Kidhome = factor(cust_data$Kidhome,
levels = seq(from = min(cust_data$Kidhome),
to = max(cust_data$Kidhome),
by = 1), ordered = TRUE)
cust_data$Teenhome = factor(cust_data$Teenhome,
levels = seq(from = min(cust_data$Teenhome),
to = max(cust_data$Teenhome),
by = 1), ordered = TRUE)Feature extraction II
Change the attribute Year_Birth to age and change the Dt_Customer to number of days a customer is enrolled with the company.
# Change Year_Birth to age
current_year = 2014
cust_data = cust_data %>% mutate(age = current_year - Year_Birth) %>%
subset(select = -c(Year_Birth))
# Change Dt_customer to days of enrollment. Assume that the current date is 1st July 2014
current_date = as.Date("2014-07-01")
cust_data = cust_data %>%
mutate(days_enroll = difftime(current_date, Dt_Customer, units = c("days")) %>%
as.numeric()) %>%
subset(select = -c(Dt_Customer))Get rid of outliers
summary(cust_data$age)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.0 37.0 44.0 45.2 55.0 121.0
cust_data = cust_data %>% filter(age < 100)
cust_data %>% arrange(desc(Income)) %>% head(3) %>% kable("html", caption = "Income in descending order") %>% kable_styling("striped") %>% scroll_box(width = "100%")| ID | Education | Marital_Status | Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumDealsPurchases | NumWebPurchases | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | min_num_household | tot_AcceptedCmp | age | days_enroll |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9432 | Graduation | Together | 666666 | 1 | 0 | 23 | 9 | 14 | 18 | 8 | 1 | 12 | 4 | 3 | 1 | 3 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 37 | 394 |
| 1503 | PhD | Together | 162397 | 1 | 1 | 31 | 85 | 1 | 16 | 2 | 1 | 2 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 38 | 393 |
| 1501 | PhD | Married | 160803 | 0 | 0 | 21 | 55 | 16 | 1622 | 17 | 3 | 4 | 15 | 0 | 28 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 32 | 696 |
cust_data = cust_data %>% filter(ID!=9432)Deal with missing values
There are 3 types of missing values:
1. missing completely at random (MCAR) 2. missing at random (MAR) 3. missing not at random (MNAR). Refer to this awesome article for more info on each types of missing values. There are several strategies to choose from when we encounter missing values in the data:
1. Complete case analysis (ignore observation(s) with missing values).
2. Single imputation - mean, median, regression model.
3. Multiple imputation - leverage distribution of observed data to estimate the missing values.
Multiple imputation is generally superior to single imputation because the missing values are calculated multiple times with many different plausible values being pooled together to get the estimates. For this experiment, Multiple Imputation by Chained Equations (MICE) is implemented to obtain the estimation of missing Income instances.
# Write a utility function to check for missing values.
check_missing_values = function(cust_data){
vec_missing = apply(cust_data, 2, function(x){ sum(any(is.na(x)))})
idx_missing = which(as.vector(vec_missing)==1)
if (length(idx_missing)>0){
cat("The columns(features) with missing values: ",
colnames(cust_data)[idx_missing])
} else {
print("No missing values")
}
}
check_missing_values(cust_data)## The columns(features) with missing values: Income
vis_miss(cust_data)cat(" \nThe number of missing values for income: ", sum(is.na(cust_data$Income)))##
## The number of missing values for income: 24
idx_missing = which(is.na(cust_data$Income))Perform multiple imputation.
# MICE implementation
imp = mice(cust_data, maxit = 0, print = F)
pred = imp$predictorMatrix
pred[,"ID"] = 0
# Use the default value of 5 iterations
imp = mice(cust_data, seed = 10, predictorMatrix = pred, print = F)
stripplot(imp, Income~.imp, pch = 20, cex = 2)cust_data = complete(imp, 5)
# Save the preprocessed data for further analysis
saveRDS(cust_data, file = "preprocessed_cust_data.rds")This wraps up the preprocessing of the data. Lets take a look at the results of missing
Income imputation
| ID | Education | Marital_Status | Income | |
|---|---|---|---|---|
| 11 | 1994 | Graduation | Married | 8028 |
| 28 | 5255 | Graduation | Single | 4023 |
| 44 | 7281 | PhD | Single | 8028 |
| 49 | 7244 | Graduation | Single | 4023 |
| 59 | 8557 | Graduation | Single | 5648 |
Data visualization
Data visualization is graphical representation of information and data.
Histogram and density plot
Lets visualize the household income. There is/are customer(s) with unusually high yearly household income based on empirical observation on the scale of x axis of the histogram plot.
#hist_data = cust_data %>% ggplot(aes(x = Income)) +
# geom_histogram(binwidth = 10000, boundary = 0)
cust_data %>% ggplot(aes(x = Income)) +
geom_histogram(aes(y = stat(count)/sum(count)), binwidth = 10000,
boundary = 0) +
ylab("Relative frequency") # density plot
cust_data %>% ggplot(aes(x = Income)) + geom_density(fill = "lightblue", alpha = 0.5) +
geom_vline(aes(xintercept=mean(Income)), color = "blue", linetype = 2) + geom_vline(aes(xintercept=median(Income)), color = "red", linetype = 3) + xlim(0, 2e+05) + theme_classic() + ggtitle("density plot for customer annual household income")cust_data %>% ggplot(aes(x = Income)) +
geom_histogram(aes(y = stat(count)/sum(count)), binwidth = 0.2,
boundary = 0) +
scale_x_log10() +
ylab("Relative frequency") +
ggtitle("Distribution of income in log10 scales")Most of the customers (99.5 %) have income of equal or less than 100k. In fact, (90.91 %) of the customers have annual household income within 20k-90k.
Scatter plots among continuous predictors
# Linear correlation and scatter plots matrix for amount spent on different merchandise
idx_cont_features = grepl("Mnt", colnames(cust_data))
cust_data[, idx_cont_features] %>% ggpairs(title = "correlogram")# Linear correlation and scatter plots matrix for number of purchases.
idx_cont_features = grepl("Num", colnames(cust_data))
cust_data[, idx_cont_features] %>% ggpairs(title = "correlogram")Correlation matrix / Correlogram
Linear correlation between numerical features.
seq = c("Mnt", "Num","Age", "days", "Income", "Recency", "household","tot")
a = sapply(seq, function(x) grep(x, colnames(cust_data)))
idx = c()
for (i in (1:length(seq))){
idx = c(idx, a[[i]])
}
corr_mat = cor(cust_data[,idx])
corr_mat[upper.tri(corr_mat)] = NA
melted_cormat = melt(corr_mat, na.rm = TRUE)
# The dimension becomes nx3 matrix
melted_cormat %>% ggplot(aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradientn(limit = c(-1,1), colors = hcl.colors(40, "Earth")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)) + coord_fixed() +
geom_text(aes(Var1, Var2, label = round(value, 2)), color = "black", size = 1.5)Mosaic Plot
Response rate from different groups of population.
cust_data %>% ggplot() +
geom_mosaic(aes(x = product(Marital_Status, Education), fill = Response)) +
theme(axis.text.x = element_text(face = "bold", angle = 90),
axis.text.y = element_text(face = "bold")) +
ggtitle("Mosaic Plot of Response, Marital status and Education")cust_data %>% ggplot() +
geom_mosaic(aes(x = product(Kidhome, Teenhome), fill = Response)) +
ggtitle("Mosaic Plot of Response, Kidhome and Teenhome")Parallel set plots
data_group = cust_data %>% group_by(Education, Marital_Status, Kidhome,
Teenhome, Response) %>%
summarise(summ = n())## `summarise()` has grouped output by 'Education', 'Marital_Status', 'Kidhome',
## 'Teenhome'. You can override using the `.groups` argument.
data_parallel = reshape2::melt(data_group)## Using Education, Marital_Status, Kidhome, Teenhome, Response as id variables
data_parallel = gather_set_data(data_parallel, 1:5)
ggplot(data_parallel, aes(x, id = id, split = y, value = value)) +
geom_parallel_sets(aes(fill = Response), alpha = 0.3, axis.width = 0.1) +
geom_parallel_sets_axes(axis.width = 0.1) +
geom_parallel_sets_labels(colour = 'white') There is virtually no chance for the customers with 2 children or 2 teenagers in their household to accept the offer.
Grouped box plots / violin plots
Based on minimum number of household members, marital status and education.
Visualize income
Distribution of income for different categories of customers.
seq = c("Income", "Education", "Marital", "min")
idx_plot = sapply(seq, function(x) grep(x, colnames(cust_data)))
idx = c()
for (i in (1:length(seq))){
idx = c(idx, idx_plot[[i]])
}
cust_data %>% dplyr::select(idx) %>%
ggplot(aes(x = Education, y = Income, fill = Education)) +
geom_violin(scale = "count", draw_quantiles = c(0.25, 0.5, 0.75)) + stat_summary(fun = mean, geom = "point", shape=21, size=1.5, color = "red") + ylim(0, 2e+05)## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(idx)` instead of `idx` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
cust_data %>% dplyr::select(idx) %>% dplyr::filter(!(Marital_Status %in% c("Alone", "Absurd"))) %>%
ggplot(aes(x = Marital_Status, y = Income, fill = Marital_Status)) +
geom_violin(scale = "count", draw_quantiles = c(0.25, 0.5, 0.75)) + stat_summary(fun = mean, geom = "point", shape=21, size=1.5, color = "red") + ylim(0, 2e+05) + ggtitle("Distribution of household income based on marital status")cust_data %>% dplyr::select(idx) %>% mutate(num_household = as.factor(min_num_household)) %>%
ggplot(aes(x = num_household, y = Income, fill = num_household)) +
geom_violin(scale = "count", draw_quantiles = c(0.25, 0.5, 0.75)) + stat_summary(fun = mean, geom = "point", shape=21, size=1.5, color = "red") + ylim(0, 2e+05) + labs(title = "Distribution of household income based on minimum number of household members", x = "minimum number of household members")## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
Visualize expenditure
Show expenditure of different types of products.
seq = c("Mnt", "Education", "Marital", "min")
idx_plot = sapply(seq, function(x) grep(x, colnames(cust_data)))
idx = c()
for (i in (1:length(seq))){
idx = c(idx, idx_plot[[i]])
}
cust_data %>% dplyr::select(idx) %>%
pivot_longer(starts_with("Mnt"), names_to = "purchase_types",
values_to = "amount_spent") %>%
ggplot(aes(x = Education, y = amount_spent)) +
geom_boxplot(fill = "chocolate2", notch = TRUE) + stat_summary(fun = mean, geom = "point", shape=21, size=1.5, color = "red") + theme_classic() + facet_wrap(~purchase_types, ncol = 2, scales = "free_y")cust_data %>% dplyr::filter(!(Marital_Status %in% c("Alone", "Absurd"))) %>% dplyr::select(idx) %>%
pivot_longer(starts_with("Mnt"), names_to = "purchase_types",
values_to = "amount_spent") %>%
ggplot(aes(x = Marital_Status, y = amount_spent)) +
geom_boxplot(fill = "chocolate2", notch = TRUE) + stat_summary(fun = mean, geom = "point", shape=21, size=1.5, color = "red") + theme_classic() + facet_wrap(~purchase_types, ncol = 2, scales = "free_y") ## notch went outside hinges. Try setting notch=FALSE.
cust_data %>% dplyr::select(idx) %>% mutate(num_household = as.factor(min_num_household)) %>%
pivot_longer(starts_with("Mnt"), names_to = "purchase_types",
values_to = "amount_spent") %>%
ggplot(aes(x = num_household, y = amount_spent)) +
geom_boxplot(fill = "chocolate2", notch = TRUE) + stat_summary(fun = mean, geom = "point", shape=21, size=1.5, color = "red") + theme_classic() + facet_wrap(~purchase_types, ncol = 2, scales = "free_y") ## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## _
## platform x86_64-w64-mingw32
## arch x86_64
## os mingw32
## system x86_64, mingw32
## status
## major 4
## minor 1.2
## year 2021
## month 11
## day 01
## svn rev 81115
## language R
## version.string R version 4.1.2 (2021-11-01)
## nickname Bird Hippie