SEMANA 2

Analisis de Clusters aplicado a People analytics

Cargar las librerias y la base de datos

rm(list = ls()) ## Borrar datos previos

library(tidyverse) ## Manejo de datos
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr) ## Presentacion de datos en graficos
library(factoextra) # Es para clasificar datos
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster) ### Analisis de Clusters


library(janitor) ## Mostrar unas tablas
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
set.seed(101) ### Punto de generacion de numeros aleatorios



# Preparando los datos
#### CARGAR LA BASE DE DATOS

Base_People_Analytics <- read.csv2("Base_People_Analytics.csv") ## Leimos la base de datos


datos <- as.data.frame(Base_People_Analytics) ## Convierto la base de datos en DATA FRAME

head(datos)
##   Age Attrition BusinessTravel DailyRate             Department
## 1  36        No  Travel_Rarely      1299 Research & Development
## 2  44        No  Travel_Rarely       477 Research & Development
## 3  44        No  Travel_Rarely      1459 Research & Development
## 4  35        No     Non-Travel      1097 Research & Development
## 5  36        No  Travel_Rarely      1223 Research & Development
## 6  30        No  Travel_Rarely       288 Research & Development
##   DistanceFromHome Education   EducationField EmployeeCount EmployeeNumber
## 1               27         3          Medical             1             13
## 2                7         4          Medical             1             36
## 3               10         4            Other             1             40
## 4               11         2          Medical             1             70
## 5                8         3 Technical Degree             1             83
## 6                2         3    Life Sciences             1            117
##   EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel
## 1                       3   Male         94              3        2
## 2                       1 Female         42              2        3
## 3                       4   Male         41              3        2
## 4                       3   Male         79              2        3
## 5                       3 Female         59              3        3
## 6                       3   Male         99              2        2
##                     JobRole JobSatisfaction MaritalStatus MonthlyIncome
## 1 Healthcare Representative               3       Married          5237
## 2 Healthcare Representative               4       Married         10248
## 3 Healthcare Representative               4       Married          6465
## 4 Healthcare Representative               1       Married          9884
## 5 Healthcare Representative               3      Divorced         10096
## 6 Healthcare Representative               4       Married          4152
##   MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## 1       16577                  6      Y       No                13
## 2        2094                  3      Y       No                14
## 3       19121                  2      Y      Yes                13
## 4        8302                  2      Y      Yes                13
## 5        8202                  1      Y       No                13
## 6       15830                  1      Y       No                19
##   PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## 1                 3                        2            80                2
## 2                 3                        4            80                1
## 3                 3                        4            80                0
## 4                 3                        3            80                1
## 5                 3                        2            80                3
## 6                 3                        1            80                3
##   TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1                17                     3               2              7
## 2                24                     4               3             22
## 3                 9                     5               4              4
## 4                10                     3               3              4
## 5                17                     2               3             17
## 6                11                     3               3             11
##   YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager Extraversion
## 1                  7                       7                    7          3.2
## 2                  6                       5                   17          2.8
## 3                  2                       1                    3          2.9
## 4                  0                       2                    3          3.6
## 5                 14                      12                    8          2.6
## 6                 10                      10                    8          2.8
##   Neuroticism Afability Consciousness Openess
## 1         1.9       3.2           3.1     3.1
## 2         2.9       3.1           2.8     2.6
## 3         4.6       3.8           3.3     4.1
## 4         4.3       3.7           3.4     3.7
## 5         3.2       4.0           2.6     2.2
## 6         3.2       3.4           3.7     3.5

Roles de la empresa

datos %>%  ### Vimos que roles hay en la empresa
  tabyl(JobRole)  
##                    JobRole   n      percent
##                              1 0.0006756757
##                        102   1 0.0006756757
##                        131   1 0.0006756757
##                        145   1 0.0006756757
##                        259   1 0.0006756757
##                        292   1 0.0006756757
##                        326   1 0.0006756757
##                         52   1 0.0006756757
##                         80   1 0.0006756757
##                         83   1 0.0006756757
##  Healthcare Representative 131 0.0885135135
##            Human Resources  52 0.0351351351
##      Laboratory Technician 259 0.1750000000
##                    Manager 102 0.0689189189
##     Manufacturing Director 145 0.0979729730
##          Research Director  80 0.0540540541
##         Research Scientist 292 0.1972972973
##            Sales Executive 326 0.2202702703
##       Sales Representative  83 0.0560810811

Seleccionamos al Ejecutivo de Ventas

datos <- subset(datos, JobRole=="Sales Executive") ## Seleccionar unicamente el Ejecutivo de Ventas

Extraemos los datos de personalidad

### EXTRAER LOS DATOS DE ENCUESTA DE PERSONALIDAD ####

datos <- as.matrix(datos[c("Extraversion","Neuroticism","Afability", 
                           "Consciousness", "Openess")])


datos <- as.data.frame(datos) ## cREE UNA BASE CON SOLO LOS DATOS DE PERSONALIDAD
head(datos)
##      Extraversion Neuroticism Afability Consciousness Openess
## 1062          3.0         3.0       3.5           3.3     3.7
## 1063          2.3         4.2       3.0           3.0     3.8
## 1064          3.4         2.4       3.3           3.6     3.0
## 1065          2.8         2.7       2.9           3.3     3.0
## 1066          3.3         2.5       3.4           3.8     3.5
## 1067          2.8         3.1       3.5           3.0     3.0

Identifica el Numero de Cluster

######################################################
#Identificando el número de clústeres
#######################################################


### Vamos a identificar el posible numero de familias (Clusters)
fviz_nbclust(x = datos, FUNcluster = kmeans, method = "wss", k.max = 15, 
             diss = get_dist(datos, method = "euclidean"), nstart = 50)

Analisis de Clusters

#Análisis de clústeres de k medias
### ORGANIZARLOS EN 6 FAMILIAS (CLUSTERS)
km_clusters <- kmeans(x = datos[, c("Extraversion", "Neuroticism",
                                    "Afability", "Consciousness",
                                    "Openess")], centers = 6, nstart = 50)
km_clusters
## K-means clustering with 6 clusters of sizes 53, 50, 45, 41, 75, 62
## 
## Cluster means:
##   Extraversion Neuroticism Afability Consciousness  Openess
## 1     2.983019    3.728302  3.166038      2.930189 2.930189
## 2     3.456000    2.894000  3.474000      3.234000 3.476000
## 3     3.206667    3.953333  3.340000      3.557778 3.566667
## 4     2.956098    2.192683  3.139024      3.063415 3.570732
## 5     2.909333    3.138667  3.196000      3.209333 3.436000
## 6     3.074194    2.606452  3.066129      3.056452 2.883871
## 
## Clustering vector:
## 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 
##    5    3    6    6    2    5    5    5    3    1    4    3    5    3    3    6 
## 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 
##    3    4    1    1    5    6    3    2    2    5    1    1    4    2    5    2 
## 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 
##    3    2    5    6    6    1    5    4    6    2    6    3    1    5    6    2 
## 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 
##    2    4    3    4    3    2    1    3    5    3    6    4    4    2    2    2 
## 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 
##    4    3    5    5    1    6    1    5    4    2    5    3    2    3    1    1 
## 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 
##    5    1    1    5    5    1    4    1    5    2    4    5    5    2    1    2 
## 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 
##    6    6    1    4    6    1    6    5    2    6    2    5    3    6    2    4 
## 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 
##    4    5    3    2    6    6    3    4    2    2    5    2    1    3    6    1 
## 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 
##    3    5    5    6    4    6    6    4    1    6    6    5    5    2    6    5 
## 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 
##    4    1    5    5    1    3    4    4    6    1    4    5    3    6    1    5 
## 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 
##    5    3    6    5    3    2    1    2    5    3    6    5    3    2    2    1 
## 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 
##    2    2    2    2    5    2    1    1    1    3    1    5    1    3    6    3 
## 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 
##    1    5    6    6    5    6    1    1    4    4    3    1    5    6    6    6 
## 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 
##    2    4    6    6    3    5    5    3    1    4    6    4    4    2    5    2 
## 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 
##    4    6    1    5    3    3    4    4    1    6    1    6    1    4    1    6 
## 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 
##    3    1    4    2    2    6    4    6    1    1    5    3    6    2    6    5 
## 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 
##    1    4    6    6    5    5    6    3    2    5    5    5    6    5    5    1 
## 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 
##    5    3    5    6    2    2    5    5    5    5    5    1    5    5    5    6 
## 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 
##    4    2    2    6    6    6    5    1    4    6    3    5    3    2    6    1 
## 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 
##    4    4    6    5    5    2    6    3    3    1    2    3    4    4    5    1 
## 1382 1383 1384 1385 1386 1387 
##    3    5    6    5    2    5 
## 
## Within cluster sum of squares by cluster:
## [1] 26.66453 21.03100 23.29778 15.90634 25.64640 25.13129
##  (between_SS / total_SS =  53.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
sil <- silhouette(km_clusters$cluster, dist(datos)) ###  REPRESENTACION DE LAS FAMILIAS


fviz_silhouette(sil) ## REPRESENTACION DEL TAMAÑO DE LOS CLUSTERES
##   cluster size ave.sil.width
## 1       1   53          0.13
## 2       2   50          0.13
## 3       3   45          0.19
## 4       4   41          0.22
## 5       5   75          0.19
## 6       6   62          0.18

Primera representacion de los clusters

# Visualizaciones
### rEPRESENTACION DE LOS CLUSTERS Y SUS DISTANCIAS AL CENTROIDE
fviz_cluster(object = km_clusters, data = datos, show.clust.cent = TRUE,
             ellipse.type = "euclid", star.plot = TRUE, repel = TRUE,
             pointsize=0.5,outlier.color= "Blue") +
  labs(title = "Resultados clustering K-means") +
  theme_bw() +  
  theme(legend.position = "none")

Ajuste de Datos

### ARREGLO DE DATOS
km_clusters$cluster <- as.factor(km_clusters$cluster)
km_clusters$cluster
## 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 
##    5    3    6    6    2    5    5    5    3    1    4    3    5    3    3    6 
## 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 
##    3    4    1    1    5    6    3    2    2    5    1    1    4    2    5    2 
## 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 
##    3    2    5    6    6    1    5    4    6    2    6    3    1    5    6    2 
## 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 
##    2    4    3    4    3    2    1    3    5    3    6    4    4    2    2    2 
## 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 
##    4    3    5    5    1    6    1    5    4    2    5    3    2    3    1    1 
## 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 
##    5    1    1    5    5    1    4    1    5    2    4    5    5    2    1    2 
## 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 
##    6    6    1    4    6    1    6    5    2    6    2    5    3    6    2    4 
## 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 
##    4    5    3    2    6    6    3    4    2    2    5    2    1    3    6    1 
## 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 
##    3    5    5    6    4    6    6    4    1    6    6    5    5    2    6    5 
## 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 
##    4    1    5    5    1    3    4    4    6    1    4    5    3    6    1    5 
## 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 
##    5    3    6    5    3    2    1    2    5    3    6    5    3    2    2    1 
## 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 
##    2    2    2    2    5    2    1    1    1    3    1    5    1    3    6    3 
## 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 
##    1    5    6    6    5    6    1    1    4    4    3    1    5    6    6    6 
## 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 
##    2    4    6    6    3    5    5    3    1    4    6    4    4    2    5    2 
## 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 
##    4    6    1    5    3    3    4    4    1    6    1    6    1    4    1    6 
## 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 
##    3    1    4    2    2    6    4    6    1    1    5    3    6    2    6    5 
## 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 
##    1    4    6    6    5    5    6    3    2    5    5    5    6    5    5    1 
## 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 
##    5    3    5    6    2    2    5    5    5    5    5    1    5    5    5    6 
## 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 
##    4    2    2    6    6    6    5    1    4    6    3    5    3    2    6    1 
## 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 
##    4    4    6    5    5    2    6    3    3    1    2    3    4    4    5    1 
## 1382 1383 1384 1385 1386 1387 
##    3    5    6    5    2    5 
## Levels: 1 2 3 4 5 6
clusters <- c(km_clusters$cluster)

datos <- cbind(datos, clusters)

colnames(datos)[6] <- "Cluster"

datos$Cluster <- factor(datos$Cluster,
                        levels = c(1,2,3,4,5,6),
                        labels = c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5", "Cluster 6"))

datos <- as.data.frame(datos)

Analisis de Cluster Definitivo

##########################################################
### HACER EL ANALISIS DEFINITIVO DE CLUSTERS##
############################################################

require(cluster) # Cargo la aplicacion para hacer el analisis
pam.res <- pam(datos, 6) ## Un analisis de 6 clusters

fviz_cluster(pam.res, geom = "point", ellipse.type = "norm",
             show.clust.cent = TRUE,star.plot = TRUE)+
  labs(title = "Resultados clustering K-means")+ theme_bw()

Tabla de resultados

Con esta tabla es que debe realizar la actividad de la semana 2

######  TABLA DE RESULTADOS ######
km_clusters$centers
##   Extraversion Neuroticism Afability Consciousness  Openess
## 1     2.983019    3.728302  3.166038      2.930189 2.930189
## 2     3.456000    2.894000  3.474000      3.234000 3.476000
## 3     3.206667    3.953333  3.340000      3.557778 3.566667
## 4     2.956098    2.192683  3.139024      3.063415 3.570732
## 5     2.909333    3.138667  3.196000      3.209333 3.436000
## 6     3.074194    2.606452  3.066129      3.056452 2.883871