#Read in all libraries
library(readr)
library(dplyr)
library(ggplot2)
dim(pp_cust_data)
[1] 2172 3
dim(sub_data)
[1] 679 5
summary(full_data)
email_address active_send active_receive pp_ind industry relationship_length site_visits sub_ind
Length:2716 Min. :0.0000 Min. :0.0000 Min. :1 : 103 Min. : 1.000 Min. : 0.0 Min. :1
Class :character 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:1 home and garden : 44 1st Qu.: 2.000 1st Qu.: 28.0 1st Qu.:1
Mode :character Median :1.0000 Median :0.0000 Median :1 outdoor : 44 Median : 5.000 Median : 97.0 Median :1
Mean :0.7813 Mean :0.4848 Mean :1 landscape engineer: 39 Mean : 8.931 Mean : 434.2 Mean :1
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1 nursery : 39 3rd Qu.:13.000 3rd Qu.: 301.0 3rd Qu.:1
Max. :1.0000 Max. :1.0000 Max. :1 (Other) : 410 Max. :30.000 Max. :16551.0 Max. :1
NA's :544 NA's :544 NA's :544 NA's :2037 NA's :2037 NA's :2037 NA's :2037
summary(common_data)
email_address active_send active_receive pp_ind industry relationship_length site_visits sub_ind
Length:135 Min. :0.0000 Min. :0.0000 Min. :1 :19 Min. : 1.000 Min. : 0.0 Min. :1
Class :character 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1 garden :12 1st Qu.: 2.000 1st Qu.: 43.5 1st Qu.:1
Mode :character Median :0.0000 Median :0.0000 Median :1 outdoor :11 Median : 6.000 Median : 145.0 Median :1
Mean :0.3481 Mean :0.3037 Mean :1 landscape designer: 8 Mean : 8.837 Mean : 764.7 Mean :1
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1 landscape engineer: 8 3rd Qu.:13.000 3rd Qu.: 520.0 3rd Qu.:1
Max. :1.0000 Max. :1.0000 Max. :1 gardening : 7 Max. :30.000 Max. :16551.0 Max. :1
(Other) :70
ggplot(data = common_data, aes(x = industry, fill = industry)) +
geom_bar(width=0.7, fill="steelblue", angle = 45) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Ignoring unknown parameters: angle
# compute unique levels in data frame
lvls <- unique(unlist(common_data$industry))
# apply the summation per value
freq <- sapply(common_data,
function(x) table(factor(x, levels = lvls,
ordered = TRUE)))
freq
email_address active_send active_receive pp_ind industry relationship_length site_visits sub_ind
0 0 0 0 19 0 0 0
supply 0 0 0 0 7 0 0 0
designer 0 0 0 0 3 0 0 0
outdoor 0 0 0 0 11 0 0 0
outdoor living 0 0 0 0 7 0 0 0
orchard 0 0 0 0 5 0 0 0
plants 0 0 0 0 7 0 0 0
grower 0 0 0 0 7 0 0 0
nursery 0 0 0 0 7 0 0 0
landscape designer 0 0 0 0 8 0 0 0
garden 0 0 0 0 12 0 0 0
landscaper 0 0 0 0 2 0 0 0
vineyard 0 0 0 0 4 0 0 0
landscape engineer 0 0 0 0 8 0 0 0
gardening 0 0 0 0 7 0 0 0
landscape architect 0 0 0 0 6 0 0 0
architect 0 0 0 0 4 0 0 0
landscaping 0 0 0 0 4 0 0 0
home and garden 0 0 0 0 5 0 0 0
hg 0 0 0 0 2 0 0 0
#Active Transactions in the last year
#Average relationship length of these active transactors is 6 years
#Most of these individuals are also frequent site visitors (24 visits/year, 2 site visits a month)
active <- common_data %>% filter(active_send==1 | active_receive ==1)
summary(active)
email_address active_send active_receive pp_ind industry relationship_length site_visits sub_ind
Length:80 Min. :0.0000 Min. :0.0000 Min. :1 :10 Min. : 1.000 Min. : 0.0 Min. :1
Class :character 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1 garden : 8 1st Qu.: 2.000 1st Qu.: 51.5 1st Qu.:1
Mode :character Median :1.0000 Median :1.0000 Median :1 landscape designer: 7 Median : 6.000 Median : 145.5 Median :1
Mean :0.5875 Mean :0.5125 Mean :1 outdoor : 6 Mean : 8.488 Mean : 623.9 Mean :1
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1 grower : 5 3rd Qu.:12.000 3rd Qu.: 446.5 3rd Qu.:1
Max. :1.0000 Max. :1.0000 Max. :1 landscape engineer: 5 Max. :30.000 Max. :16551.0 Max. :1
(Other) :39
lvls <- unique(unlist(full_data$industry))
# apply the summation per value
freq <- sapply(full_data,
function(x) table(factor(x, levels = lvls,
ordered = TRUE)))
freq
email_address active_send active_receive pp_ind industry relationship_length site_visits sub_ind
0 0 0 0 103 0 0 0
supply 0 0 0 0 36 0 0 0
designer 0 0 0 0 28 0 0 0
outdoor 0 0 0 0 44 0 0 0
outdoor living 0 0 0 0 38 0 0 0
orchard 0 0 0 0 22 0 0 0
plants 0 0 0 0 25 0 0 0
grower 0 0 0 0 29 0 0 0
nursery 0 0 0 0 39 0 0 0
landscape designer 0 0 0 0 28 0 0 0
garden 0 0 0 0 34 0 0 0
landscaper 0 0 0 0 15 0 0 0
vineyard 0 0 0 0 24 0 0 0
landscape engineer 0 0 0 0 39 0 0 0
gardening 0 0 0 0 31 0 0 0
landscape architect 0 0 0 0 29 0 0 0
architect 0 0 0 0 27 0 0 0
landscaping 0 0 0 0 17 0 0 0
home and garden 0 0 0 0 44 0 0 0
hg 0 0 0 0 27 0 0 0
#Select only needed columns
final <- full_data %>% select(relationship_length, site_visits, inboth, industry_ind, sv_yr, active)
final <- as.matrix(final)
final <- prop.table(final, margin = 2)
final_set <- bind_cols(as.data.frame(full_data[,1]), as.data.frame(final))
final_set
# Running the elbow method
#Code Source: https://uc-r.github.io/kmeans_clustering
library(cluster) # Needed for silhouette function
require(purrr)
kmeansDat <- final_set[,-(1)] # Extract only customer columns
kmeansDat.t <- t(kmeansDat) # Get customers in rows and products in columns
set.seed(123)
wss <- function(k) {
kmeans(kmeansDat, k, nstart = 10 )$tot.withinss
}
# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15
# extract wss for 2-15 clusters
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
#Run Silhoute Method in Conjunction
#Code Source: https://uc-r.github.io/kmeans_clustering
#2 clusters is the winner
avg_sil <- function(k) {
km.res <- kmeans(kmeansDat, centers = k, nstart = 25)
ss <- silhouette(km.res$cluster, dist(kmeansDat))
mean(ss[, 3])
}
# Compute and plot wss for k = 2 to k = 15
k.values <- 2:15
# extract avg silhouette for 2-15 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)
plot(k.values, avg_sil_values,
type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Average Silhouettes")
#Final
set.seed(123)
final <- kmeans(kmeansDat, 4, nstart = 25)
print(final)
K-means clustering with 4 clusters of sizes 131, 2379, 199, 7
Cluster means:
relationship_length site_visits inboth industry_ind sv_yr active
1 1.389756e-03 1.339122e-03 0.007407407 0.0014843087 2.029560e-03 3.144178e-04
2 9.884755e-05 2.684723e-05 0.000000000 0.0002087128 8.491371e-05 4.028121e-04
3 2.780227e-03 2.179866e-03 0.000000000 0.0014918342 1.936575e-03 0.000000e+00
4 4.216924e-03 4.670173e-02 0.004232804 0.0017361111 2.096279e-02 7.448235e-05
Clustering vector:
[1] 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2
[88] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 1 2 1 2 2 2 2 2 2 2
[175] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[262] 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[349] 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[436] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[523] 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2
[610] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2
[697] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[784] 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
[871] 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[958] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2
[ reached getOption("max.print") -- omitted 1716 entries ]
Within cluster sum of squares by cluster:
[1] 0.002729883 0.001347098 0.002742412 0.000925972
(between_SS / total_SS = 79.0 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size" "iter" "ifault"
#Visually able to see data is segmented, and I think one of these clusters would be our appropriate demographic
fviz_cluster(final, data = kmeansDat)
full_data %>%
mutate(Cluster = final$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
argument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NA
out <- cbind(full_data, clusterNum = final$cluster)
head(out)
#Cluster 1: 131 Customers = Can't Target Existing Customers!
#Bad: Current customers
#Okay: 8 year relationship length, active rate strong
#Good: Industry indicator strong, 53 site visits per year, 1/week
out %>% filter(clusterNum == 1)
out %>% filter(clusterNum == 2) %>% filter(sub_ind == 1) %>% group_by(clusterNum) %>%
summarise_all("mean")
argument is not numeric or logical: returning NAargument is not numeric or logical: returning NA
out %>% filter(clusterNum == 3) %>% filter(sub_ind == 1) %>% group_by(clusterNum) %>%
summarise_all("mean")
argument is not numeric or logical: returning NAargument is not numeric or logical: returning NA
out %>% filter(inboth ==1) %>% group_by(clusterNum) %>%
summarise_all("mean")
argument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NA
Conclusion:
As the sample of 679 records from an active subscriber base of 30,000. Of those 679 records. We also received a PayPal data set of 2172 records.
My strategy was to first learn a bit about the data, build out new features (industry indicator, active indicator, in both data sets indicator, site visits/year ratio). Once I built out these features, I chose to run a K-Means clustering algorithm. While the silhouette and elbow method provided the optimal clusters to be 2, I chose to run a variety of numbers and settled on 4 as they were the best at telling a story.
Each cluster identifies a specific type of consumer, where Cluster 2 is the most interesting:
Cluster 1: TLDR: Can’t Target Existing Customers! n = 131 Customers Bad: Current customers Okay: 8 year relationship length, active rate strong Good: Industry indicator strong, 53 site visits per year, 1/week
Cluster 2: TLDR: Majority PayPal Customers, but Net New Customers would be a WIN! n = 2379 Customers 342 customers Net New subscriber base customers with 83% industry indicator Mostly current customers, but once we filter into Net New customers we see a different story Okay: O site visits (about ~1/month), 4 year relationship length
If we assume the 679:30,000 ratio holds, the 342 Net New customers from cluster 2 that have a strong industry indicator could be considered a strong lead, bringing in potentiall 15,000 new customers
Cluster 3: Also, Net New Customers! n = 199 Customers - Bad: 16 year relationship and 0 transactions Okay: 1 site view per week Good: Industry relevant
Cluster 4: Too small to consider
From our final analysis, we want to confirm that current Paypal customers who are subscribers to this magazine have a relatively high level of active transaction rates. We see above that Clusters 1 and 4 have strong industry indicators. Cluster 1 has an activity rate of 0.6 and is most similar to the subset of the Net New 342 customers found in Cluster 2. They are younger customers (4 years vs 8 years) and have less site visits, but with the potential of 15,000 new customers and 0.6 active rate in existing customers with a similar industry indicator, I would argue that this vendor can provide 1000 strong leads.