CustomerData <- read_delim("marketing_campaign.csv",
delim = "\t", escape_double = FALSE,
col_types = cols(Year_Birth = col_double(),
Education = col_factor(levels = c("2n Cycle",
"Basic", "Graduation", "Master",
"PhD")), Marital_Status = col_factor(levels = c("Yolo",
"Absurd", "Alone", "Divorced",
"Married", "Single", "Together",
"Widow")), Dt_Customer = col_date(format = "%d-%m-%Y")), trim_ws = TRUE)
CustomerData <- subset(CustomerData, Year_Birth > 1900)
CustomerData <- subset(CustomerData, Income < 500000)
CustomerData <- CustomerData[CustomerData$Marital_Status != "Absurd", ]
CustomerData <- CustomerData[CustomerData$Marital_Status != "YOLO", ]
CustomerData <- CustomerData <- CustomerData[CustomerData$Marital_Status != "Alone", ]
CustomerData <- CustomerData[!is.na(CustomerData$Income), ]
CustomerData <- CustomerData[, c(-27,-28)]
CustomerData$AmountSpent <- CustomerData$MntWines + CustomerData$MntFruits + CustomerData$MntMeatProducts + CustomerData$MntFishProducts + CustomerData$MntSweetProducts + CustomerData$MntGoldProds
CustomerData$NumPurchases <- CustomerData$NumDealsPurchases + CustomerData$NumWebPurchases + CustomerData$NumCatalogPurchases + CustomerData$NumStorePurchases
CustomerData$Age <- 2024 - CustomerData$Year_BirthCustomerEDA_Week2
Clean Data
Note: removed total number of kids column and grouped back to kid and teen home
Clustering
Correlation to amount spent
CorrMatrix2 <- CustomerData[, c("Age", "Income", "Kidhome","Teenhome", "AmountSpent", "Recency", "NumPurchases")]
cor_calc2 <- cor(CorrMatrix2)
corrplot(cor_calc2, method = "ellipse", type = "upper",
tl.col = "black", tl.srt = 45, addCoef.col = "black",
number.cex = 1) Results
Without kids and teens grouped together can see that kid home was a lot more of the weight leading to a negative linear correlation with amount spent
Income, kid home, and number of purchases most correlated with amount spent
Number of purchases is co linear to amount spent, thus don’t use in clustering
PCA
pca_data <- CustomerData[, c("Income","AmountSpent", "Kidhome")]
pca_data <- scale(pca_data)
pca_customer = prcomp(pca_data, center = TRUE, scale = TRUE)
summary(pca_customer)Importance of components:
PC1 PC2 PC3
Standard deviation 1.5006 0.7367 0.45314
Proportion of Variance 0.7506 0.1809 0.06845
Cumulative Proportion 0.7506 0.9315 1.00000
customer_transform <- as.data.frame(-pca_customer$x[,1:2])Note
- PCA reduces dimensions in data set by transforming correlated variables into principle components
Results
- First two principle components explain 93% of data variability so can reduce data down to 2-dimensions
K Means Clustering
fviz_nbclust(customer_transform, kmeans, method = 'wss')fviz_nbclust(customer_transform, kmeans, method = 'silhouette')k = 3
kmeans_customer = kmeans(customer_transform, centers = k, nstart = 50)
print(kmeans_customer)K-means clustering with 3 clusters of sizes 883, 538, 784
Cluster means:
PC1 PC2
1 -1.4843575 -0.4293951
2 -0.0429773 0.9790346
3 1.7012876 -0.1882204
Clustering vector:
[1] 3 1 3 1 1 2 2 1 1 1 2 3 1 2 3 1 2 3 1 2 1 3 2 2 2 1 1 3 2 2 2 2 3 1 3 1 2
[38] 2 3 1 2 1 3 2 1 3 2 3 1 3 1 3 3 1 2 3 1 3 3 3 1 1 3 3 3 3 3 1 2 1 3 3 2 2
[75] 1 1 1 2 3 1 1 2 3 2 1 1 1 2 1 3 2 2 2 3 3 3 1 1 2 1 3 3 3 2 3 1 1 3 1 1 1
[112] 2 1 1 1 3 3 3 1 3 2 3 1 3 1 1 1 3 3 3 3 1 3 1 1 1 2 2 2 2 3 3 2 1 2 3 1 3
[149] 1 3 3 1 2 2 3 1 1 1 1 1 2 3 3 1 1 3 2 1 3 1 2 1 2 2 3 1 1 3 1 1 1 3 3 3 1
[186] 2 3 3 3 1 2 1 1 2 3 2 3 1 1 3 1 1 3 2 3 1 3 3 1 3 1 2 3 3 1 1 3 1 2 3 1 1
[223] 3 1 3 3 1 3 3 1 3 3 3 3 1 1 3 2 3 1 3 2 1 1 1 3 1 1 1 1 3 1 3 1 3 1 2 1 1
[260] 3 3 3 3 3 1 2 2 2 1 1 3 1 3 2 2 1 3 1 1 3 1 1 3 3 1 2 1 1 1 3 2 3 2 2 1 1
[297] 3 1 1 1 1 1 3 1 1 3 2 3 1 1 1 1 1 2 3 1 1 3 3 1 3 3 2 3 2 1 3 1 3 1 1 3 3
[334] 3 3 3 1 2 3 3 2 3 2 1 1 1 2 3 1 3 2 1 1 1 3 1 1 2 1 3 1 1 1 2 1 1 1 2 3 2
[371] 3 3 1 3 2 3 3 1 1 1 1 1 3 1 2 1 1 1 1 1 2 1 2 3 1 3 3 1 3 3 1 1 1 1 3 3 1
[408] 3 3 1 3 3 1 3 3 1 2 3 2 1 1 1 2 2 1 2 2 1 3 1 2 2 3 1 1 3 1 3 1 2 3 3 3 2
[445] 3 1 3 3 1 2 1 3 1 1 1 1 3 1 2 2 1 1 1 1 2 3 3 3 2 1 2 3 1 3 3 2 2 2 3 3 3
[482] 1 2 2 3 2 3 3 3 2 2 1 3 2 3 2 3 1 1 3 3 1 3 2 3 1 1 3 2 3 1 2 3 1 1 1 2 2
[519] 3 2 1 1 2 1 3 2 1 3 1 1 1 1 1 2 2 3 1 1 3 2 3 1 3 3 3 2 2 3 1 2 1 2 1 1 2
[556] 1 3 1 1 1 1 1 2 1 2 1 1 3 2 2 2 1 3 3 1 2 1 1 1 1 1 2 3 3 2 1 1 2 1 2 3 2
[593] 2 1 1 2 1 1 3 1 2 2 2 3 1 3 1 3 3 1 1 3 3 3 2 3 1 3 2 3 3 3 3 3 2 3 1 3 1
[630] 3 2 3 1 3 2 2 3 2 2 3 1 2 1 3 2 2 2 2 1 1 1 3 2 3 3 2 1 3 3 1 3 3 3 2 2 3
[667] 3 3 3 3 3 2 3 2 2 1 2 2 1 2 3 3 1 3 3 1 3 1 2 3 1 1 2 1 2 1 3 3 1 3 1 3 3
[704] 1 3 1 2 3 3 1 3 1 1 1 1 3 3 3 2 1 3 3 1 1 1 3 3 1 3 1 3 3 2 3 3 3 3 3 3 1
[741] 2 1 1 3 1 3 2 3 3 2 2 3 3 1 1 2 1 2 3 1 3 3 1 1 1 1 1 1 2 1 2 3 3 2 1 2 2
[778] 3 3 3 1 2 1 1 3 3 2 2 2 1 3 2 1 3 2 3 2 1 2 3 1 3 1 2 2 3 3 3 1 3 1 1 2 2
[815] 1 1 3 1 3 2 2 1 2 1 1 3 3 3 2 2 1 2 2 3 1 2 3 2 1 1 3 1 1 2 1 1 2 2 1 2 1
[852] 3 3 1 1 1 3 1 1 3 1 1 2 1 1 3 3 1 1 1 3 1 1 3 1 2 3 3 3 1 1 3 2 3 2 1 3 3
[889] 1 1 2 3 3 3 1 3 3 3 3 2 3 1 3 1 1 3 2 3 3 3 3 3 1 3 1 3 1 3 3 2 1 3 2 3 3
[926] 2 3 2 1 1 3 1 1 1 1 1 1 2 2 1 2 3 2 1 2 1 2 3 1 1 1 3 1 1 2 3 3 3 1 1 3 1
[963] 2 1 3 3 2 3 3 3 1 3 1 1 1 1 2 3 1 1 2 3 3 1 1 2 2 3 3 1 1 3 2 1 2 1 2 3 2
[1000] 1 2 1 1 3 2 1 2 1 1 2 3 3 3 3 1 3 2 1 1 1 2 2 1 2 3 1 2 2 3 1 2 3 1 3 1 1
[1037] 3 2 2 3 3 3 3 2 1 2 1 3 3 2 3 1 3 2 2 2 3 3 1 1 1 3 1 3 1 3 2 1 3 1 3 3 1
[1074] 3 1 1 2 3 3 1 3 3 2 1 1 1 3 2 2 1 1 3 3 1 3 1 3 1 2 2 2 3 1 1 1 2 1 3 1 2
[1111] 3 3 1 1 3 3 1 1 3 2 2 2 1 1 2 3 1 1 1 2 1 3 3 2 3 2 1 1 3 3 3 2 1 2 3 2 2
[1148] 2 1 3 3 2 2 3 1 1 2 1 2 1 3 2 2 3 1 1 1 1 3 2 2 3 3 2 2 1 3 2 3 3 3 1 2 2
[1185] 1 3 2 3 2 2 2 2 2 3 3 1 1 2 1 3 2 1 1 3 1 1 3 3 1 2 1 1 1 2 1 1 1 2 3 1 1
[1222] 1 1 3 3 1 2 1 1 1 3 3 3 3 3 3 1 2 3 1 3 1 3 3 1 1 3 1 1 2 3 3 3 1 1 1 3 1
[1259] 1 3 2 3 3 1 2 2 1 2 1 3 2 2 1 1 1 1 2 1 2 3 1 3 3 2 1 3 3 1 3 3 3 3 3 2 3
[1296] 1 1 2 1 1 1 3 2 3 1 1 3 2 1 2 3 1 1 3 3 3 2 3 2 2 1 2 2 3 2 1 1 2 2 3 3 3
[1333] 3 1 2 3 3 1 2 3 1 1 1 1 2 2 3 2 1 1 2 2 1 1 2 3 2 1 1 2 2 1 1 3 1 1 3 1 1
[1370] 1 1 1 3 1 1 3 1 1 3 2 2 1 1 2 3 2 1 3 3 1 3 1 1 2 2 1 1 1 3 3 2 2 1 1 1 1
[1407] 2 1 1 3 1 1 1 1 2 2 1 1 1 1 3 3 2 3 2 3 3 1 3 3 1 2 3 2 1 3 2 2 2 1 1 1 2
[1444] 2 3 1 3 1 1 2 3 3 2 3 2 2 3 3 3 1 1 3 3 3 1 3 1 1 1 3 1 1 3 1 3 3 3 1 1 1
[1481] 1 3 3 1 2 3 2 3 1 3 3 2 1 2 1 1 1 3 3 1 1 1 1 3 1 3 2 3 1 3 1 1 1 1 3 3 3
[1518] 1 3 3 1 2 1 2 2 3 1 2 3 3 3 3 1 1 1 1 3 1 2 1 3 1 2 3 3 1 3 3 1 3 1 1 2 1
[1555] 3 2 2 1 3 3 2 3 1 1 2 1 3 2 3 1 2 2 2 1 1 3 1 2 3 3 1 1 1 3 1 2 2 3 2 3 1
[1592] 1 1 1 1 2 3 2 1 3 1 1 2 2 2 1 1 3 1 1 1 2 2 2 3 1 1 1 3 1 1 3 2 2 2 3 1 1
[1629] 3 1 3 2 1 2 3 2 3 1 1 3 1 2 1 3 1 3 3 2 3 3 3 3 2 1 1 1 1 3 1 1 1 2 1 3 2
[1666] 3 3 2 3 1 1 2 2 1 3 1 3 3 1 1 2 1 1 3 2 1 2 3 3 2 3 1 1 3 1 1 2 1 3 3 3 1
[1703] 1 1 2 3 3 1 2 1 3 3 3 1 3 2 1 2 1 2 3 2 3 2 3 3 3 3 2 1 2 1 1 1 1 1 3 2 1
[1740] 2 3 1 3 1 3 2 2 1 2 3 3 2 1 1 2 1 2 1 1 3 3 1 1 2 1 3 3 1 1 3 1 1 1 2 1 3
[1777] 3 3 3 1 1 1 2 2 1 3 3 1 2 3 3 3 1 2 3 2 1 3 1 1 3 3 1 2 3 3 1 1 2 2 1 1 1
[1814] 3 2 2 3 1 3 1 3 1 3 1 2 2 1 3 1 3 3 3 3 1 2 3 1 3 2 2 3 3 1 1 1 1 3 1 3 1
[1851] 2 2 2 2 3 3 3 3 1 2 3 2 1 1 3 3 1 3 3 1 2 3 2 1 3 3 2 1 2 2 1 2 3 3 1 2 1
[1888] 3 3 3 3 3 2 2 1 2 1 2 3 3 3 3 1 3 3 3 1 1 2 1 1 3 1 2 2 2 1 3 2 1 3 1 3 3
[1925] 3 1 2 2 2 3 3 2 3 1 2 3 1 3 3 1 1 2 3 3 3 3 3 3 1 1 2 1 2 2 1 1 1 1 1 3 2
[1962] 1 2 1 2 1 2 3 2 3 3 1 3 3 2 1 2 1 1 1 2 1 2 3 2 2 3 3 1 3 2 3 1 2 1 1 1 1
[1999] 1 1 3 3 1 1 2 2 1 3 3 1 3 1 2 3 2 1 3 2 3 1 1 1 1 2 3 3 3 1 1 3 1 1 2 3 3
[2036] 3 1 3 1 3 3 2 3 1 1 2 3 1 2 3 1 1 2 1 2 3 3 3 3 1 2 1 2 2 2 1 3 1 3 3 1 3
[2073] 1 3 2 1 2 3 2 3 1 2 1 2 2 3 1 3 3 1 1 1 2 2 1 3 2 3 3 1 1 3 3 1 2 1 1 2 1
[2110] 1 1 1 2 3 3 1 1 1 2 1 3 1 1 1 1 1 2 2 2 3 1 2 3 1 1 3 3 3 1 3 3 3 3 3 3 3
[2147] 2 2 1 2 2 1 2 3 3 3 3 1 3 1 2 3 3 1 2 2 1 1 1 2 3 2 1 3 1 2 1 2 3 1 3 2 1
[2184] 1 3 1 1 2 3 1 1 2 1 1 2 1 1 2 2 1 3 1 3 3 1
Within cluster sum of squares by cluster:
[1] 450.0028 265.6369 521.8228
(between_SS / total_SS = 79.9 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
Results
3 main clusters
80% of total variance is explained by clusters, which tells us the clusters are well separated and good overall
fviz_cluster(kmeans_customer, data = customer_transform)pca_data <- as.data.frame(pca_data)
pca_data$Cluster = as.factor(kmeans_customer$cluster)
pca_data_long <- pca_data %>%
pivot_longer(cols = -Cluster, names_to = "Variable", values_to = "Value")
pca_mean <- pca_data_long %>%
group_by(Cluster, Variable) %>%
summarize(Mean_Value = mean(Value, na.rm = TRUE)) %>%
ungroup()`summarise()` has grouped output by 'Cluster'. You can override using the
`.groups` argument.
x <- ggplot(pca_mean, aes(x = Variable, y = Mean_Value, group = Cluster, color = as.factor(Cluster))) +
geom_line(size = 1) +
geom_point(size = 2) + # Optional: add points to show the mean value for each cluster
labs(title = "Mean Values of Variables by Cluster",
x = "Variable",
y = "Mean Value",
color = "Cluster",
linetype = "Cluster") +
theme_minimal() +
scale_linetype_manual(values = c("solid", "dashed", "dotted")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for better visibility
ggplotly(x)Clusters
Cluster 1: Low amount spent, low income, high kid home
Cluster 2: Medium - low amount spent, medium - low income, low kid home
Cluster 3: High amount spent, high income, low kid home
Cluster vs. amount spent on each good
CustomerData$Cluster <- as.factor(kmeans_customer$cluster)
CustomerData$MntWines.Per <- CustomerData$MntWines / CustomerData$AmountSpent * 100
CustomerData$MntMeat.Per <- CustomerData$MntMeatProducts / CustomerData$AmountSpent * 100
CustomerData$MntFish.Per <- CustomerData$MntFishProducts / CustomerData$AmountSpent * 100
CustomerData$MntSweet.Per <- CustomerData$MntSweetProducts / CustomerData$AmountSpent * 100
CustomerData$MntGold.Per <- CustomerData$MntGoldProds / CustomerData$AmountSpent * 100
MntGood <- CustomerData[, c("MntWines.Per", "MntMeat.Per", "MntFish.Per",
"MntSweet.Per", "MntGold.Per", "Cluster")]
MntGood_long <- MntGood %>%
pivot_longer(cols = -Cluster, names_to = "Variable", values_to = "Value")
# Create stacked bar chart
y <- ggplot(MntGood_long, aes(x = Variable, y = Value, fill = Cluster)) +
geom_bar(stat = "identity", position = "fill") +
ylab("Proportion of Amount Spent") +
xlab("Cluster") +
ggtitle("Proportion of Amount Spent per Item by Cluster") +
scale_y_continuous(labels = scales::percent) +
theme_minimal()
yNote: Normalized amount spent per item to the amount spent total
Results
Cluster 1: Spent most amount of money on gold products
Cluster 2: Least amount spent on meat products, relatively even in other products
Cluster 3: Most spent on meat and wine products, little spent on gold products
Cluster vs. Accepted Campaign
Campaign <- CustomerData[, c("AcceptedCmp1","AcceptedCmp2","AcceptedCmp3","AcceptedCmp4","AcceptedCmp5","Response", "Cluster")]
n <- 538 # Specify the number of rows you want from each category
# Randomly select the same amount of rows for each factor
set.seed(123) # Set seed for reproducibility
Equal_Cluster <- Campaign %>%
group_by(Cluster) %>%
sample_n(n, replace = TRUE) %>% # Sample n rows from each category
ungroup()
campaign_summary <- Equal_Cluster %>%
group_by(Cluster) %>%
summarise(
C1 = (sum(AcceptedCmp1) / n) * 100,
C2 = (sum(AcceptedCmp2) / n) * 100,
C3 = (sum(AcceptedCmp3) / n) * 100,
C4 = (sum(AcceptedCmp4) / n) * 100,
C5 = (sum(AcceptedCmp5) / n) * 100,
Response = (sum(Response) / n) * 100
) %>%
pivot_longer(cols = c(2:7), names_to = "Campaign", values_to = "Accepted")
CampignCluster <- ggplot(campaign_summary, aes(x=Campaign, y=Accepted)) +
geom_segment( aes(x=Campaign, xend=Campaign, y=0, yend=Accepted)) +
geom_point( aes(color=as.factor(Cluster)), size=4, alpha=0.6) +
facet_wrap(~Cluster) +
labs(title = "Lollipop Chart by Cluster",
x = "Campaign",
y = "Accepted Percentage",
color = "Cluster") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank()
)
ggplotly(CampignCluster)Results:
Cluster 1 and 2 more likely to accept campaign 3 and 4
Cluster 3 more likely to accept campaign 1, 4, and 5
Would it be a better idea for the company to focus more on catering their advertising to a higher number of low income people or a lower number of high income people?
CustomerData$IncomeBracket <- cut(CustomerData$Income,
breaks = c(0, 30000, 60000, 90000, 120000, Inf),
labels = c("0-30k", "30k-60k", "60k-90k", "90k-120k", "120k+"),
right = FALSE)
NumPeopleByIncomeBracket <- ggplot(CustomerData, aes(x = IncomeBracket)) +
geom_histogram(stat = "count", fill = "skyblue", color = "black") +
labs(title = "Distribution of People in Each Income Bracket",
x = "Income Bracket",
y = "Number of People") +
theme_minimal()Warning in geom_histogram(stat = "count", fill = "skyblue", color = "black"):
Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
ggplotly(NumPeopleByIncomeBracket)IncomeBracket_Spending <- CustomerData %>%
group_by(IncomeBracket) %>%
summarise(
TotalSpent = round(sum(AmountSpent, na.rm = TRUE), 2),
PeopleCount = n(),
AvgSpentPerPerson = round(TotalSpent / PeopleCount, 2)
) %>%
mutate(IncomeGroup = ifelse(IncomeBracket %in% c("0-30k", "30k-60k"),
"Low Income",
ifelse(IncomeBracket %in% c("60k-90k", "90k-120k", "120k+"),
"High Income", NA)))
print(IncomeBracket_Spending)# A tibble: 5 × 5
IncomeBracket TotalSpent PeopleCount AvgSpentPerPerson IncomeGroup
<fct> <dbl> <int> <dbl> <chr>
1 0-30k 26708 370 72.2 Low Income
2 30k-60k 298385 999 299. Low Income
3 60k-90k 929171 783 1187. High Income
4 90k-120k 79775 46 1734. High Income
5 120k+ 5235 7 748. High Income
IncomeGroup_Spending <- IncomeBracket_Spending %>%
group_by(IncomeGroup) %>%
summarise(
TotalSpent = sum(TotalSpent, na.rm = TRUE),
PeopleCount = sum(PeopleCount),
AvgSpentPerPerson = round(TotalSpent / PeopleCount, 2)
)
print(IncomeGroup_Spending)# A tibble: 2 × 4
IncomeGroup TotalSpent PeopleCount AvgSpentPerPerson
<chr> <dbl> <int> <dbl>
1 High Income 1014181 836 1213.
2 Low Income 325093 1369 237.
# Bar plot for Total Spending by Income Group
p1 <- ggplot(IncomeGroup_Spending, aes(x = IncomeGroup, y = TotalSpent, fill = IncomeGroup)) +
geom_bar(stat = "identity") +
labs(title = "Total Spending by Income Group",
x = "Income Group",
y = "Total Spent") +
theme_minimal()
# Bar plot for Total Number of People by Income Group
p2 <- ggplot(IncomeGroup_Spending, aes(x = IncomeGroup, y = PeopleCount, fill = IncomeGroup)) +
geom_bar(stat = "identity") +
labs(title = "Total Number of People by Income Group",
x = "Income Group",
y = "# of People") +
theme_minimal()
# Bar plot for Average Amount Spent per Person by Income Group
p3 <- ggplot(IncomeGroup_Spending, aes(x = IncomeGroup, y = AvgSpentPerPerson, fill = IncomeGroup)) +
geom_bar(stat = "identity") +
labs(title = "Average Amount Spent per Person by Income Group",
x = "Income Group",
y = "Avg Spent per Person") +
theme_minimal()
# Combine the plots
combined_plot <- (p2 / p3 / p1) + plot_layout(ncol = 1, heights = c(3, 3, 3))
combined_plotNotes:
The low income group contains almost twice as many shoppers as the high income group does.
The average shopper in the low income group is spending less than 20% as much as the average shopper in the high income group.
The high income group has spent more than 3x as much as the low income group, even though there are half as many people.
Conclusion:
- If the company wants to maximize profits, they should cater their advertising towards higher income shoppers, even if there are fewer of them. This group of shoppers are bringing the company 3x as much income, so the company should focus its marketing efforts on bringing in more of this type of shopper.