El presente documento muestra un ejemplo de segmentación realizada a datos demográficos y transaccionales de clientes de una entidad financiera
## Loading required package: seriation
## Loading required package: cluster
## Loading required package: TSP
## Loading required package: gclus
## Loading required package: grid
## Loading required package: colorspace
## Loading required package: MASS
## Loading required package: mclust
## Package 'mclust' version 4.2
## Loading required package: flexmix
## Loading required package: lattice
##
## Attaching package: 'lattice'
##
## The following object(s) are masked from 'package:seriation':
##
## panel.lines
##
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
##
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
##
## Attaching package: 'gdata'
##
## The following object(s) are masked from 'package:stats4':
##
## nobs
##
## The following object(s) are masked from 'package:utils':
##
## object.size
##
## The following object(s) are masked from 'package:stats':
##
## nobs
seg_training <- read.csv("segmentacion_training.csv")
summary(seg_training)
## codigo Tipodepersona Activos Pasivos
## Min. : 1 Juridica: 715 Min. :0.00e+00 Min. :0.00e+00
## 1st Qu.:2044 Natural :5829 1st Qu.:1.32e+08 1st Qu.:0.00e+00
## Median :4082 Median :4.00e+08 Median :1.00e+07
## Mean :4082 Mean :1.92e+10 Mean :4.31e+09
## 3rd Qu.:6110 3rd Qu.:1.08e+09 3rd Qu.:1.16e+08
## Max. :8180 Max. :2.67e+13 Max. :5.29e+12
##
## Patrimonio Ingresos Egresos CodigoAE
## Min. :-7.91e+09 Min. :0.00e+00 Min. :0.00e+00 REN :2378
## 1st Qu.: 1.10e+08 1st Qu.:2.70e+06 1st Qu.:1.00e+06 AS :2113
## Median : 3.35e+08 Median :6.00e+06 Median :3.00e+06 M : 281
## Mean : 1.49e+10 Mean :5.47e+08 Mean :3.66e+08 L : 261
## 3rd Qu.: 9.02e+08 3rd Qu.:1.46e+07 3rd Qu.:7.00e+06 NAT : 225
## Max. : 2.67e+13 Max. :4.55e+11 Max. :4.76e+11 S : 225
## (Other):1061
## Total
## Min. :1.00e+00
## 1st Qu.:6.58e+05
## Median :4.65e+06
## Mean :2.76e+08
## 3rd Qu.:3.22e+07
## Max. :2.41e+11
##
str(seg_training)
## 'data.frame': 6544 obs. of 9 variables:
## $ codigo : int 1 4 5 7 8 9 10 11 12 13 ...
## $ Tipodepersona: Factor w/ 2 levels "Juridica","Natural": 2 2 2 2 2 2 2 2 2 2 ...
## $ Activos : num 2.38e+08 3.17e+09 6.97e+08 6.68e+08 1.80e+10 ...
## $ Pasivos : num 0 0 0 60559000 0 ...
## $ Patrimonio : num 2.38e+08 3.17e+09 6.97e+08 6.07e+08 1.80e+10 ...
## $ Ingresos : num 2.00e+06 7.50e+06 2.95e+06 2.55e+06 1.10e+08 ...
## $ Egresos : num 2.0e+06 3.0e+06 1.5e+06 1.0e+06 1.5e+07 0.0 3.2e+06 1.0e+06 1.0e+06 1.8e+06 ...
## $ CodigoAE : Factor w/ 24 levels "A","AS","B","C",..: 21 21 21 21 21 21 2 2 21 2 ...
## $ Total : num 7.13e+04 1.54e+08 4.68e+06 5.33e+05 3.19e+09 ...
En la presente sección se muestran los histogramas de las variables numéricas para revisar la distribución de los datos.
par(mfrow = c(2, 3))
p1 <- hist((seg_training$Activos))
p2 <- hist((seg_training$Pasivos))
p3 <- hist((seg_training$Patrimonio))
p4 <- hist((seg_training$Ingresos))
p5 <- hist((seg_training$Egresos))
p6 <- hist((seg_training$Total))
Se muestra, además, la correlación entre las variables numéricas a través de un gráfico de correlación y gráficos de puntos para cada combinación de variables numéricas.
corrgram(seg_training, order = TRUE, upper.panel = panel.pie, text.panel = panel.txt,
main = "Correlación de las variables Numéricas", diag.panel = panel.minmax,
lower.panel = panel.pts)
Adicionalmente se muestran las distribuciones de las variables categóricas que se puedan presentar
par(mfrow = c(2, 1))
barplot(table(seg_training$Tipodepersona), main = "Tipo De Persona")
barplot(table(seg_training$CodigoAE), main = "Actividad Económica")
Dado que las variables numéricas están demasiado concentradas y además la segmentación debe tener variables homogéneas, se utiliza la función logaritmo natural para distribuir mejor los datos y colocarlos en un rango proporcional. Se requieren los siguientes procedimientos
data <- seg_training[, c(3, 4, 5, 6, 7, 9)]
data$Patrimonio[which(data$Patrimonio < 0)] <- 0
summary(data$Patrimonio)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00e+00 1.10e+08 3.35e+08 1.49e+10 9.02e+08 2.67e+13
data <- log(data + 1)
summary(data)
## Activos Pasivos Patrimonio Ingresos
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.:18.7 1st Qu.: 0.0 1st Qu.:18.5 1st Qu.:14.8
## Median :19.8 Median :16.1 Median :19.6 Median :15.6
## Mean :19.8 Mean :11.3 Mean :19.5 Mean :15.7
## 3rd Qu.:20.8 3rd Qu.:18.6 3rd Qu.:20.6 3rd Qu.:16.5
## Max. :30.9 Max. :29.3 Max. :30.9 Max. :26.8
## Egresos Total
## Min. : 0.0 Min. : 0.693
## 1st Qu.:13.8 1st Qu.:13.396
## Median :14.9 Median :15.352
## Mean :14.5 Mean :15.410
## 3rd Qu.:15.8 3rd Qu.:17.288
## Max. :26.9 Max. :26.209
str(data)
## 'data.frame': 6544 obs. of 6 variables:
## $ Activos : num 19.3 21.9 20.4 20.3 23.6 ...
## $ Pasivos : num 0 0 0 17.9 0 ...
## $ Patrimonio: num 19.3 21.9 20.4 20.2 23.6 ...
## $ Ingresos : num 14.5 15.8 14.9 14.8 18.5 ...
## $ Egresos : num 14.5 14.9 14.2 13.8 16.5 ...
## $ Total : num 11.2 18.9 15.4 13.2 21.9 ...
par(mfrow = c(2, 3))
p1 <- hist(data$Activos)
p2 <- hist(data$Pasivos)
p3 <- hist(data$Patrimonio)
p4 <- hist(data$Ingresos)
p5 <- hist(data$Egresos)
p6 <- hist(data$Total)
corrgram(data, order = FALSE, upper.panel = panel.pie, text.panel = panel.txt,
main = "Correlación de las variables Numéricas Normalizadas", diag.panel = panel.minmax,
lower.panel = panel.pts)
set.seed(410681785)
mydata <- data[, c(1, 2, 4, 5, 6)]
wss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))
for (i in 2:15) wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Numero De Clusters", ylab = "Suma de Cuadrados Dentro de los Grupos")
# K-Medias
fit <- kmeans(mydata, 7) # Solución con 9 Clusters
aggregate(mydata, by = list(fit$cluster), FUN = mean)
## Group.1 Activos Pasivos Ingresos Egresos Total
## 1 1 17.95 0.017491 14.583 13.92 13.22
## 2 2 19.12 15.524557 14.960 13.25 15.31
## 3 3 20.79 18.515499 16.461 15.69 17.72
## 4 4 20.13 18.085002 16.149 15.41 12.99
## 5 5 23.88 22.368086 20.762 20.13 19.59
## 6 6 17.84 0.356985 9.855 0.00 14.93
## 7 7 19.55 0.006409 15.367 14.62 16.72
aggregate(mydata, by = list(fit$cluster), FUN = sd)
## Group.1 Activos Pasivos Ingresos Egresos Total
## 1 1 2.325 0.3473 1.075 1.020 1.535
## 2 2 1.318 1.7711 1.750 3.831 1.869
## 3 3 1.025 1.5715 1.026 1.121 1.484
## 4 4 1.049 1.5247 1.060 1.169 1.548
## 5 5 1.899 2.0812 1.950 2.107 2.691
## 6 6 2.627 2.2136 6.447 0.000 2.622
## 7 7 1.301 0.2104 1.007 1.078 1.459
mydata <- data.frame(mydata, fit$cluster)
par(mar = c(5, 4, 6, 2))
height <- table(fit$cluster)
mp <- barplot(height, main = "Distribución de clientes en los Clusters")
text(mp, height, labels = format(height, 2), pos = 1, cex = 0.9)
# Gráfico del cluster contra la primera y la segunda componente principal
# Cambiar los parametros puede hacer el gráfico mas fácil de leer
# library->cluster
clusplot(mydata, fit$cluster, color = TRUE, shade = TRUE, labels = 4, lines = 0)
# Gráfico de centroides vs Primero y segundo discriminante library->fpc
plotcluster(mydata, fit$cluster)
seg_test <- read.csv("segmentacion_test.csv")
summary(seg_test)
## codigo Tipodepersona Activos Pasivos
## Min. : 2 Juridica: 199 Min. :0.00e+00 Min. :0.00e+00
## 1st Qu.:2056 Natural :1437 1st Qu.:1.50e+08 1st Qu.:0.00e+00
## Median :4114 Median :4.11e+08 Median :1.36e+07
## Mean :4125 Mean :1.30e+10 Mean :2.88e+09
## 3rd Qu.:6241 3rd Qu.:1.12e+09 3rd Qu.:1.20e+08
## Max. :8178 Max. :4.24e+12 Max. :1.23e+12
##
## Patrimonio Ingresos Egresos CodigoAE
## Min. :-9.10e+07 Min. :0.0e+00 Min. :0.00e+00 REN :582
## 1st Qu.: 1.18e+08 1st Qu.:2.8e+06 1st Qu.:1.00e+06 AS :525
## Median : 3.40e+08 Median :6.0e+06 Median :2.60e+06 M : 84
## Mean : 1.01e+10 Mean :2.0e+10 Mean :1.74e+10 L : 72
## 3rd Qu.: 9.20e+08 3rd Qu.:1.5e+07 3rd Qu.:7.00e+06 S : 61
## Max. : 4.23e+12 Max. :3.2e+13 Max. :2.82e+13 NAT : 54
## (Other):258
## Total
## Min. :2.00e+00
## 1st Qu.:5.93e+05
## Median :4.56e+06
## Mean :2.53e+08
## 3rd Qu.:3.55e+07
## Max. :7.32e+10
##
data_test <- seg_test[, c(3, 4, 5, 6, 7, 9)]
data_test$Patrimonio[which(data_test$Patrimonio < 0)] <- 0
summary(data_test$Patrimonio)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00e+00 1.18e+08 3.40e+08 1.01e+10 9.20e+08 4.23e+12
data_test <- log(data_test + 1)
summary(data_test)
## Activos Pasivos Patrimonio Ingresos
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.:18.8 1st Qu.: 0.0 1st Qu.:18.6 1st Qu.:14.8
## Median :19.8 Median :16.4 Median :19.6 Median :15.6
## Mean :19.8 Mean :11.6 Mean :19.6 Mean :15.7
## 3rd Qu.:20.8 3rd Qu.:18.6 3rd Qu.:20.6 3rd Qu.:16.5
## Max. :29.1 Max. :27.8 Max. :29.1 Max. :31.1
## Egresos Total
## Min. : 0.0 Min. : 1.1
## 1st Qu.:13.8 1st Qu.:13.3
## Median :14.8 Median :15.3
## Mean :14.4 Mean :15.4
## 3rd Qu.:15.8 3rd Qu.:17.4
## Max. :31.0 Max. :25.0
mydata_test <- data_test[, c(1, 2, 4, 5, 6)]
closest.cluster <- function(x) {
cluster.dist <- apply(fit$centers, 1, function(y) sqrt(sum((x - y)^2)))
return(which.min(cluster.dist)[1])
}
clusters2 <- apply(mydata_test, 1, closest.cluster)
str(clusters2)
## int [1:1636] 4 1 1 1 7 1 7 3 4 2 ...
mydata_test <- data.frame(mydata_test, clusters2)
fit$centers
## Activos Pasivos Ingresos Egresos Total
## 1 17.95 0.017491 14.583 13.92 13.22
## 2 19.12 15.524557 14.960 13.25 15.31
## 3 20.79 18.515499 16.461 15.69 17.72
## 4 20.13 18.085002 16.149 15.41 12.99
## 5 23.88 22.368086 20.762 20.13 19.59
## 6 17.84 0.356985 9.855 0.00 14.93
## 7 19.55 0.006409 15.367 14.62 16.72
aggregate(mydata, by = list(fit$cluster), FUN = mean)
## Group.1 Activos Pasivos Ingresos Egresos Total fit.cluster
## 1 1 17.95 0.017491 14.583 13.92 13.22 1
## 2 2 19.12 15.524557 14.960 13.25 15.31 2
## 3 3 20.79 18.515499 16.461 15.69 17.72 3
## 4 4 20.13 18.085002 16.149 15.41 12.99 4
## 5 5 23.88 22.368086 20.762 20.13 19.59 5
## 6 6 17.84 0.356985 9.855 0.00 14.93 6
## 7 7 19.55 0.006409 15.367 14.62 16.72 7
aggregate(mydata_test, by = list(clusters2), FUN = mean)
## Group.1 Activos Pasivos Ingresos Egresos Total clusters2
## 1 1 18.09 0.00 14.625 13.94 13.05 1
## 2 2 19.27 15.65 14.959 12.77 15.27 2
## 3 3 20.83 18.56 16.407 15.70 17.66 3
## 4 4 20.10 18.10 16.151 15.37 12.89 4
## 5 5 23.73 22.23 20.724 20.07 19.66 5
## 6 6 18.10 0.00 9.355 0.00 14.77 6
## 7 7 19.51 0.00 15.472 14.66 16.74 7
aggregate(mydata, by = list(fit$cluster), FUN = sd)
## Group.1 Activos Pasivos Ingresos Egresos Total fit.cluster
## 1 1 2.325 0.3473 1.075 1.020 1.535 0
## 2 2 1.318 1.7711 1.750 3.831 1.869 0
## 3 3 1.025 1.5715 1.026 1.121 1.484 0
## 4 4 1.049 1.5247 1.060 1.169 1.548 0
## 5 5 1.899 2.0812 1.950 2.107 2.691 0
## 6 6 2.627 2.2136 6.447 0.000 2.622 0
## 7 7 1.301 0.2104 1.007 1.078 1.459 0
aggregate(mydata_test, by = list(clusters2), FUN = sd)
## Group.1 Activos Pasivos Ingresos Egresos Total clusters2
## 1 1 1.794 0.000 0.8481 0.9497 1.645 0
## 2 2 1.407 2.042 1.5673 4.4575 2.073 0
## 3 3 1.096 1.490 1.3770 1.2175 1.480 0
## 4 4 1.138 1.542 1.0345 1.0650 1.550 0
## 5 5 1.902 1.911 2.2340 2.2746 2.519 0
## 6 6 2.006 0.000 6.8179 0.0000 2.682 0
## 7 7 1.392 0.000 1.0984 1.1350 1.466 0
par(mar = c(5, 4, 6, 2))
height <- table(clusters2)
mp <- barplot(height, main = "Distribución de clientes en los Clusters")
text(mp, height, labels = format(height, 2), pos = 1, cex = 0.9)
base_segm_trim_anterior = read.xls("base completa segmentacion.xls", sheet = 1,
header = TRUE)
base_segm_trim_actual = read.xls("base completa segmentacion_nueva.xls", sheet = 1,
header = TRUE)
data_trim_actual <- base_segm_trim_actual[, c(3, 4, 5, 6, 7, 9)]
data_trim_actual$Patrimonio[which(data_trim_actual$Patrimonio < 0)] <- 0
data_trim_actual <- log(data_trim_actual + 1)
data_trim_actual <- data_trim_actual[, c(1, 2, 4, 5, 6)]
clusters_trim_actual <- apply(data_trim_actual, 1, closest.cluster)
base_segm_trim_actual <- data.frame(base_segm_trim_actual, clusters_trim_actual)
data_trim_anterior <- base_segm_trim_anterior[, c(3, 4, 5, 6, 7, 9)]
data_trim_anterior$Patrimonio[which(data_trim_anterior$Patrimonio < 0)] <- 0
data_trim_anterior <- log(data_trim_anterior + 1)
data_trim_anterior <- data_trim_anterior[, c(1, 2, 4, 5, 6)]
clusters_trim_anterior <- apply(data_trim_anterior, 1, closest.cluster)
base_segm_trim_anterior <- data.frame(base_segm_trim_anterior, clusters_trim_anterior)
sub_anterior <- base_segm_trim_anterior[, c(1, 10)]
total <- merge(base_segm_trim_actual, sub_anterior, by = "codigo")
# str(total)
cambiados <- total[which(total$clusters_trim_actual != total$clusters_trim_anterior),
]
str(cambiados)
## 'data.frame': 6 obs. of 11 variables:
## $ codigo : int 1 20 21 22 23 43
## $ Tipodepersona : Factor w/ 2 levels "Juridica","Natural": 2 2 2 2 2 2
## $ Activos : num 3.38e+08 3.90e+09 2.23e+09 1.30e+09 3.43e+08 ...
## $ Pasivos : num 1.50e+07 2.92e+08 3.98e+08 9.80e+08 1.76e+08 ...
## $ Patrimonio : num 3.23e+08 3.61e+09 1.83e+09 3.20e+08 1.67e+08 ...
## $ Ingresos : num 7000000 27000000 30500000 5000000 8746083 ...
## $ Egresos : num 3000000 6000000 5500000 3500000 6265166 ...
## $ CodigoAE : Factor w/ 24 levels "A","AS","B","C",..: 21 2 21 2 21 8
## $ Total : num 1.17e+06 5.59e+08 2.50e+08 2.00e+07 1.58e+06 ...
## $ clusters_trim_actual : int 4 3 3 3 4 2
## $ clusters_trim_anterior: int 1 7 7 7 7 4
write.table(cambiados, "cambiados.txt", row.names = FALSE)
cambiados
## codigo Tipodepersona Activos Pasivos Patrimonio Ingresos Egresos
## 1 1 Natural 3.376e+08 15000000 3.226e+08 7000000 3000000
## 20 20 Natural 3.900e+09 292170000 3.608e+09 27000000 6000000
## 21 21 Natural 2.232e+09 397618000 1.835e+09 30500000 5500000
## 22 22 Natural 1.300e+09 980134000 3.199e+08 5000000 3500000
## 23 23 Natural 3.425e+08 175909000 1.666e+08 8746083 6265166
## 43 43 Natural 2.887e+08 67279000 2.214e+08 9102000 500000
## CodigoAE Total clusters_trim_actual clusters_trim_anterior
## 1 REN 1171264 4 1
## 20 AS 559490976 3 7
## 21 REN 250436363 3 7
## 22 AS 20000000 3 7
## 23 REN 1579439 4 7
## 43 G 2744143 2 4