library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(readxl)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(cluster)
data <- read.table("marketing_campaign.csv", sep = "\t", header = T)
head(na.omit(data))
## ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1 5524 1957 Graduation Single 58138 0 0 04-09-2012
## 2 2174 1954 Graduation Single 46344 1 1 08-03-2014
## 3 4141 1965 Graduation Together 71613 0 0 21-08-2013
## 4 6182 1984 Graduation Together 26646 1 0 10-02-2014
## 5 5324 1981 PhD Married 58293 1 0 19-01-2014
## 6 7446 1967 Master Together 62513 0 1 09-09-2013
## Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1 58 635 88 546 172 88
## 2 38 11 1 6 2 1
## 3 26 426 49 127 111 21
## 4 26 11 4 20 10 3
## 5 94 173 43 118 46 27
## 6 16 520 42 98 0 42
## MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1 88 3 8 10
## 2 6 2 1 1
## 3 42 1 8 2
## 4 5 2 2 0
## 5 15 5 5 3
## 6 14 2 6 4
## NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1 4 7 0 0 0
## 2 2 5 0 0 0
## 3 10 4 0 0 0
## 4 4 6 0 0 0
## 5 6 5 0 0 0
## 6 10 6 0 0 0
## AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
## 1 0 0 0 3 11 1
## 2 0 0 0 3 11 0
## 3 0 0 0 3 11 0
## 4 0 0 0 3 11 0
## 5 0 0 0 3 11 0
## 6 0 0 0 3 11 0
Niños
data$age <- 2021 - data$Year_Birth
data$child <- data$Kidhome + data$Teenhome
Total de gastos
data$total_spent <- data$MntMeatProducts + data$MntFishProducts+ data$MntWines + data$MntFruits + data$MntSweetProducts + data$MntGoldProds
Campaña
data$accepted <- data$AcceptedCmp1 + data$AcceptedCmp2 + data$AcceptedCmp3 + data$AcceptedCmp4 + data$AcceptedCmp5
head(data)
## ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1 5524 1957 Graduation Single 58138 0 0 04-09-2012
## 2 2174 1954 Graduation Single 46344 1 1 08-03-2014
## 3 4141 1965 Graduation Together 71613 0 0 21-08-2013
## 4 6182 1984 Graduation Together 26646 1 0 10-02-2014
## 5 5324 1981 PhD Married 58293 1 0 19-01-2014
## 6 7446 1967 Master Together 62513 0 1 09-09-2013
## Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1 58 635 88 546 172 88
## 2 38 11 1 6 2 1
## 3 26 426 49 127 111 21
## 4 26 11 4 20 10 3
## 5 94 173 43 118 46 27
## 6 16 520 42 98 0 42
## MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1 88 3 8 10
## 2 6 2 1 1
## 3 42 1 8 2
## 4 5 2 2 0
## 5 15 5 5 3
## 6 14 2 6 4
## NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1 4 7 0 0 0
## 2 2 5 0 0 0
## 3 10 4 0 0 0
## 4 4 6 0 0 0
## 5 6 5 0 0 0
## 6 10 6 0 0 0
## AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response age child
## 1 0 0 0 3 11 1 64 0
## 2 0 0 0 3 11 0 67 2
## 3 0 0 0 3 11 0 56 0
## 4 0 0 0 3 11 0 37 1
## 5 0 0 0 3 11 0 40 1
## 6 0 0 0 3 11 0 54 1
## total_spent accepted
## 1 1617 0
## 2 27 0
## 3 776 0
## 4 53 0
## 5 422 0
## 6 716 0
str(data)
## 'data.frame': 2240 obs. of 33 variables:
## $ ID : int 5524 2174 4141 6182 5324 7446 965 6177 4855 5899 ...
## $ Year_Birth : int 1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 ...
## $ Education : chr "Graduation" "Graduation" "Graduation" "Graduation" ...
## $ Marital_Status : chr "Single" "Single" "Together" "Together" ...
## $ Income : int 58138 46344 71613 26646 58293 62513 55635 33454 30351 5648 ...
## $ Kidhome : int 0 1 0 1 1 0 0 1 1 1 ...
## $ Teenhome : int 0 1 0 0 0 1 1 0 0 1 ...
## $ Dt_Customer : chr "04-09-2012" "08-03-2014" "21-08-2013" "10-02-2014" ...
## $ Recency : int 58 38 26 26 94 16 34 32 19 68 ...
## $ MntWines : int 635 11 426 11 173 520 235 76 14 28 ...
## $ MntFruits : int 88 1 49 4 43 42 65 10 0 0 ...
## $ MntMeatProducts : int 546 6 127 20 118 98 164 56 24 6 ...
## $ MntFishProducts : int 172 2 111 10 46 0 50 3 3 1 ...
## $ MntSweetProducts : int 88 1 21 3 27 42 49 1 3 1 ...
## $ MntGoldProds : int 88 6 42 5 15 14 27 23 2 13 ...
## $ NumDealsPurchases : int 3 2 1 2 5 2 4 2 1 1 ...
## $ NumWebPurchases : int 8 1 8 2 5 6 7 4 3 1 ...
## $ NumCatalogPurchases: int 10 1 2 0 3 4 3 0 0 0 ...
## $ NumStorePurchases : int 4 2 10 4 6 10 7 4 2 0 ...
## $ NumWebVisitsMonth : int 7 5 4 6 5 6 6 8 9 20 ...
## $ AcceptedCmp3 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ AcceptedCmp4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AcceptedCmp2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Complain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Z_CostContact : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Z_Revenue : int 11 11 11 11 11 11 11 11 11 11 ...
## $ Response : int 1 0 0 0 0 0 0 0 1 0 ...
## $ age : num 64 67 56 37 40 54 50 36 47 71 ...
## $ child : int 0 2 0 1 1 1 1 1 1 2 ...
## $ total_spent : int 1617 27 776 53 422 716 590 169 46 49 ...
## $ accepted : int 0 0 0 0 0 0 0 0 0 1 ...
data_new <- data[c(-1,-2,-6,-7,-8,-10,-11,-12,-13,-14,-15,-21,-22,-23,-24,-25,-27,-28)]
head(data_new )
## Education Marital_Status Income Recency NumDealsPurchases NumWebPurchases
## 1 Graduation Single 58138 58 3 8
## 2 Graduation Single 46344 38 2 1
## 3 Graduation Together 71613 26 1 8
## 4 Graduation Together 26646 26 2 2
## 5 PhD Married 58293 94 5 5
## 6 Master Together 62513 16 2 6
## NumCatalogPurchases NumStorePurchases NumWebVisitsMonth Complain Response age
## 1 10 4 7 0 1 64
## 2 1 2 5 0 0 67
## 3 2 10 4 0 0 56
## 4 0 4 6 0 0 37
## 5 3 6 5 0 0 40
## 6 4 10 6 0 0 54
## child total_spent accepted
## 1 0 1617 0
## 2 2 27 0
## 3 0 776 0
## 4 1 53 0
## 5 1 422 0
## 6 1 716 0
ggplot(data_new, aes(x = Income)) +
geom_histogram(bins = 40, fill = "#adcae6") +
labs(x = "Income", y = "Frequency") +
theme_minimal()
Chequeamos mediante boxplot los cuartiles de ingresos
data_filtered <- data_new[data_new$Income < 180000, ]
ggplot(data = data_filtered, aes(x = "", y = Income)) +
geom_boxplot(fill = "#adcae6", outlier.color = "blue", notch = TRUE) +
coord_flip() +
ggtitle("customer income") +
xlab("") +
ylab("Income") +
theme_minimal()
Grafico de puntos sobre el total gastado sobre el ingreso
ggplot(data = data_filtered, aes(x = Income, y = total_spent)) +
geom_point(color = "#adcae6") +
xlab("Income") +
ylab("spent") +
theme_minimal()
Chequeamos la distribución de la variable edad
ggplot(data = data_new, aes(x = age)) +
geom_histogram(bins = 50, fill = "#adcae6") +
xlab("Age") +
ylab("Frequency") +
theme_minimal()
Veamos la edad en un grafico de cajas por posible presencia de
outliers
median(data_new$age)
## [1] 51
mean(data_new$age)
## [1] 52.1942
ggplot(data = data_new, aes(x = "", y = age)) +
geom_boxplot(fill = "#adcae6", outlier.color = "blue", notch = TRUE) +
coord_flip() +
ggtitle("customer Age") +
xlab("Age") +
ylab("") +
theme_minimal()
El hecho de que la mediana sea levemente mayor que la media nos podria indicar que hay una mayor concentración de consumidores con edades por encima de la mediana (51 años) en comparación con los consumidores más jóvenes.
ggplot(data = data_new, aes(x = total_spent)) +
geom_histogram(bins = 50, fill = "#adcae6") +
xlab("Total Spent") +
ylab("Frequency") +
theme_minimal()
A partir de la varibale creada total_spent veamos los gastos totales de
los consumidores y la posible presencia de outliers
median(data_new$total_spend)
## NULL
mean(data_new$total_spend)
## Warning in mean.default(data_new$total_spend): argument is not numeric or
## logical: returning NA
## [1] NA
boxplot(data_new$total_spent,
main = "total spent by the customer",
xlab = "total",
ylab = "",
col = "#adcae6",
border = "blue",
horizontal = TRUE,
notch = TRUE
)
Grafico de ingresos por educacion
ggplot(data_new, aes(x=Education,y=Income,fill=Education)) + ylim(0,180000) + geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=2, notch=T)
## Warning: Removed 25 rows containing non-finite values (`stat_boxplot()`).
Los ingresos mejoran levemente a mayor educación
Grafico de ingresos agrupado por status marital
ggplot(data_new, aes(x=Marital_Status,y=Income,fill=Marital_Status))+ylim(0,180000)+
geom_boxplot(outlier.colour="black", outlier.shape=16,outlier.size=2, notch=T)
## Warning: Removed 25 rows containing non-finite values (`stat_boxplot()`).
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
# eliminar outliers
remove_outliers <- function(data, columns = names(data)) {
for (col in columns) {
if (is.numeric(data[[col]])) {
Q1 <- quantile(data[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(data[[col]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
data <- data[data[[col]] >= lower_bound & data[[col]] <= upper_bound, ]
}
}
return(data)
}
data_selected <- data_new %>%
select("age", "Education" ,"Marital_Status","Income", "Recency","NumDealsPurchases" ,"NumWebPurchases", "NumCatalogPurchases","NumStorePurchases" , "NumWebVisitsMonth","total_spent")%>%
na.omit()
data_selected <- remove_outliers(data_selected)
data_selected <- data_selected %>%
filter(age <= 70, Income <= 130000)
Creamos las variables dummies
dummys <- dummyVars(" ~ .", data = data_selected, fullRank = T)
data_transformed <- data.frame(predict(dummys, newdata = data_selected))
glimpse(data_transformed)
## Rows: 1,968
## Columns: 20
## $ age <dbl> 64, 67, 56, 37, 40, 54, 50, 36, 47, 45, 62, 69,…
## $ EducationBasic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
## $ EducationGraduation <dbl> 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,…
## $ EducationMaster <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ EducationPhD <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ Marital_StatusAlone <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Marital_StatusDivorced <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,…
## $ Marital_StatusMarried <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,…
## $ Marital_StatusSingle <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ Marital_StatusTogether <dbl> 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ Marital_StatusWidow <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Marital_StatusYOLO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Income <dbl> 58138, 46344, 71613, 26646, 58293, 62513, 55635…
## $ Recency <dbl> 58, 38, 26, 26, 94, 16, 34, 32, 19, 59, 82, 53,…
## $ NumDealsPurchases <dbl> 3, 2, 1, 2, 5, 2, 4, 2, 1, 1, 1, 3, 1, 3, 2, 1,…
## $ NumWebPurchases <dbl> 8, 1, 8, 2, 5, 6, 7, 4, 3, 2, 3, 6, 1, 3, 2, 4,…
## $ NumCatalogPurchases <dbl> 10, 1, 2, 0, 3, 4, 3, 0, 0, 0, 4, 1, 0, 0, 1, 2…
## $ NumStorePurchases <dbl> 4, 2, 10, 4, 6, 10, 7, 4, 2, 3, 8, 5, 3, 3, 3, …
## $ NumWebVisitsMonth <dbl> 7, 5, 4, 6, 5, 6, 6, 8, 9, 8, 2, 6, 8, 8, 6, 8,…
## $ total_spent <dbl> 1617, 27, 776, 53, 422, 716, 590, 169, 46, 61, …
data_scaled <- scale(data_transformed)
calculate_silhouette <- function(data, n_clusters) {
kmeans_result <- kmeans(data, centers = n_clusters, nstart = 100)
if (length(unique(kmeans_result$cluster)) < 2) {
return(NA)
}
silhouette_scores <- silhouette(kmeans_result$cluster, dist(data))
if (ncol(silhouette_scores) < 2) {
return(NA)
}
silhouette_mean <- mean(silhouette_scores[, "sil_width"])
return(silhouette_mean)
}
for (n_clusters in 1:9) {
silhouette_avg <- calculate_silhouette(data_transformed, n_clusters)
if (is.na(silhouette_avg)) {
cat("Para n_clusters =", n_clusters, "No se puede calcular el Silhouette score\n")
} else {
cat("Para n_clusters =", n_clusters, "El promedio de Silhouette score es:", silhouette_avg, "\n")
}
}
## Para n_clusters = 1 No se puede calcular el Silhouette score
## Para n_clusters = 2 El promedio de Silhouette score es: 0.6227314
## Para n_clusters = 3 El promedio de Silhouette score es: 0.550596
## Para n_clusters = 4 El promedio de Silhouette score es: 0.5390714
## Para n_clusters = 5 El promedio de Silhouette score es: 0.5396294
## Para n_clusters = 6 El promedio de Silhouette score es: 0.5250815
## Para n_clusters = 7 El promedio de Silhouette score es: 0.5130204
## Para n_clusters = 8 El promedio de Silhouette score es: 0.5161952
## Para n_clusters = 9 El promedio de Silhouette score es: 0.5209248
Para n_clusters = 2 El promedio de Silhouette score es: 0.6227314: es el valor más alto entre todas las opciones, lo que sugiere que con 2 clusters, los datos están bien agrupados, con una clara separación entre los clusters.
fviz_nbclust(data_scaled,kmeans,method="wss") + geom_vline(xintercept=2,linetype=2)
Aqui aparece el numero ideal de clusters
Ejecutamos el K means e imprimios los centroides
set.seed(123)
km.res <- kmeans(data_scaled, 2, nstart = 100)
print(km.res$centers)
## age EducationBasic EducationGraduation EducationMaster EducationPhD
## 1 -0.1298240 0.1272381 -0.04419167 0.04631849 -0.06324823
## 2 0.1696995 -0.1663195 0.05776519 -0.06054527 0.08267500
## Marital_StatusAlone Marital_StatusDivorced Marital_StatusMarried
## 1 0.0298843 -0.03612259 0.01631872
## 2 -0.0390633 0.04721768 -0.02133104
## Marital_StatusSingle Marital_StatusTogether Marital_StatusWidow
## 1 0.005157777 0.01060206 -0.02784891
## 2 -0.006741995 -0.01385850 0.03640274
## Marital_StatusYOLO Income Recency NumDealsPurchases NumWebPurchases
## 1 0.02439423 -0.7182696 -0.02011606 0.1049735 -0.5263960
## 2 -0.03188694 0.9388870 0.02629473 -0.1372162 0.6880792
## NumCatalogPurchases NumStorePurchases NumWebVisitsMonth total_spent
## 1 -0.6812476 -0.6745867 0.5048631 -0.7382267
## 2 0.8904936 0.8817868 -0.6599325 0.9649740
print(km.res$size)
## [1] 1115 853
Proporcion de Varianza explicada por el modelo:
print(km.res$betweenss/km.res$totss)
## [1] 0.1679612
fviz_cluster(km.res, data_transformed, geom = "point",ellipse.type = "norm",repel = TRUE)
data_selected$cluster = as.factor(km.res$cluster)
head(data_selected)
## age Education Marital_Status Income Recency NumDealsPurchases
## 1 64 Graduation Single 58138 58 3
## 2 67 Graduation Single 46344 38 2
## 3 56 Graduation Together 71613 26 1
## 4 37 Graduation Together 26646 26 2
## 5 40 PhD Married 58293 94 5
## 6 54 Master Together 62513 16 2
## NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
## 1 8 10 4 7
## 2 1 1 2 5
## 3 8 2 10 4
## 4 2 0 4 6
## 5 5 3 6 5
## 6 6 4 10 6
## total_spent cluster
## 1 1617 2
## 2 27 1
## 3 776 2
## 4 53 1
## 5 422 1
## 6 716 2
ggplot(data_selected ,aes(x=cluster,y=total_spent,fill=cluster))+
geom_boxplot(outlier.colour="black", outlier.shape=16,outlier.size=2, notch=T)
ggplot(data_selected, aes(x=cluster,y=age,fill=cluster))+
geom_boxplot(outlier.colour="black", outlier.shape=16,outlier.size=2, notch=T)
ggplot(data_selected, aes(x=cluster,y=NumDealsPurchases,fill=cluster))+geom_boxplot(outlier.colour="black", outlier.shape=16,outlier.size=2, notch=T)
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
ggplot(data_selected, aes(x=cluster,y=NumWebPurchases,fill=cluster)) + geom_boxplot(outlier.colour="black", outlier.shape=16,outlier.size=2, notch=T)
ggplot(data_selected, aes(x=cluster,y=NumCatalogPurchases,fill=cluster))+geom_boxplot(outlier.colour="black", outlier.shape=16,outlier.size=2, notch=T)
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
ggplot(data_selected, aes(x=cluster,y=NumStorePurchases,fill=cluster))+
geom_boxplot(outlier.colour="black", outlier.shape=16,outlier.size=2, notch=T)
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
salida <- data_selected %>%
group_by(cluster) %>%
summarise(across(where(is.numeric), list(mean = ~mean(.), sd = ~sd(.), min = ~min(.), max = ~max(.), median = ~median(.))))
salida
## # A tibble: 2 × 46
## cluster age_mean age_sd age_min age_max age_median Income_mean Income_sd
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 49.2 10.3 25 70 48 36030. 12270.
## 2 2 52.4 10.9 26 70 52 70332. 11177.
## # ℹ 38 more variables: Income_min <int>, Income_max <int>, Income_median <int>,
## # Recency_mean <dbl>, Recency_sd <dbl>, Recency_min <int>, Recency_max <int>,
## # Recency_median <int>, NumDealsPurchases_mean <dbl>,
## # NumDealsPurchases_sd <dbl>, NumDealsPurchases_min <int>,
## # NumDealsPurchases_max <int>, NumDealsPurchases_median <int>,
## # NumWebPurchases_mean <dbl>, NumWebPurchases_sd <dbl>,
## # NumWebPurchases_min <int>, NumWebPurchases_max <int>, …
El Cluster 2 tiene un ingreso promedio significativamente más alto, realiza más compras en la web, en catálogos y en tiendas físicas, y gasta significativamente más en general en comparación con el Cluster 1.
El Cluster 1 tiene ingresos más bajos, realiza menos compras en todos los canales, y tiene un gasto total menor.
La recencia es similar en ambos clusters, sugiriendo patrones de compra recientes comparables.
Ambos clusters tienen edades promedio similares, aunque el Cluster 2 es ligeramente mayor en promedio.