rm(list = ls()) ## Borrar datos previos
library(tidyverse) ## Manejo de datos
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr) ## Presentacion de datos en graficos
library(factoextra) # Es para clasificar datos
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster) ### Analisis de Clusters
library(janitor) ## Mostrar unas tablas
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
set.seed(101) ### Punto de generacion de numeros aleatorios
# Preparando los datos
#### CARGAR LA BASE DE DATOS
Base_People_Analytics <- read.csv2("Base_People_Analytics.csv") ## Leimos la base de datos
datos <- as.data.frame(Base_People_Analytics) ## Convierto la base de datos en DATA FRAME
head(datos)
## Age Attrition BusinessTravel DailyRate Department
## 1 36 No Travel_Rarely 1299 Research & Development
## 2 44 No Travel_Rarely 477 Research & Development
## 3 44 No Travel_Rarely 1459 Research & Development
## 4 35 No Non-Travel 1097 Research & Development
## 5 36 No Travel_Rarely 1223 Research & Development
## 6 30 No Travel_Rarely 288 Research & Development
## DistanceFromHome Education EducationField EmployeeCount EmployeeNumber
## 1 27 3 Medical 1 13
## 2 7 4 Medical 1 36
## 3 10 4 Other 1 40
## 4 11 2 Medical 1 70
## 5 8 3 Technical Degree 1 83
## 6 2 3 Life Sciences 1 117
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel
## 1 3 Male 94 3 2
## 2 1 Female 42 2 3
## 3 4 Male 41 3 2
## 4 3 Male 79 2 3
## 5 3 Female 59 3 3
## 6 3 Male 99 2 2
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## 1 Healthcare Representative 3 Married 5237
## 2 Healthcare Representative 4 Married 10248
## 3 Healthcare Representative 4 Married 6465
## 4 Healthcare Representative 1 Married 9884
## 5 Healthcare Representative 3 Divorced 10096
## 6 Healthcare Representative 4 Married 4152
## MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## 1 16577 6 Y No 13
## 2 2094 3 Y No 14
## 3 19121 2 Y Yes 13
## 4 8302 2 Y Yes 13
## 5 8202 1 Y No 13
## 6 15830 1 Y No 19
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## 1 3 2 80 2
## 2 3 4 80 1
## 3 3 4 80 0
## 4 3 3 80 1
## 5 3 2 80 3
## 6 3 1 80 3
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1 17 3 2 7
## 2 24 4 3 22
## 3 9 5 4 4
## 4 10 3 3 4
## 5 17 2 3 17
## 6 11 3 3 11
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager Extraversion
## 1 7 7 7 3.2
## 2 6 5 17 2.8
## 3 2 1 3 2.9
## 4 0 2 3 3.6
## 5 14 12 8 2.6
## 6 10 10 8 2.8
## Neuroticism Afability Consciousness Openess
## 1 1.9 3.2 3.1 3.1
## 2 2.9 3.1 2.8 2.6
## 3 4.6 3.8 3.3 4.1
## 4 4.3 3.7 3.4 3.7
## 5 3.2 4.0 2.6 2.2
## 6 3.2 3.4 3.7 3.5
datos %>% ### Vimos que roles hay en la empresa
tabyl(JobRole)
## JobRole n percent
## 1 0.0006756757
## 102 1 0.0006756757
## 131 1 0.0006756757
## 145 1 0.0006756757
## 259 1 0.0006756757
## 292 1 0.0006756757
## 326 1 0.0006756757
## 52 1 0.0006756757
## 80 1 0.0006756757
## 83 1 0.0006756757
## Healthcare Representative 131 0.0885135135
## Human Resources 52 0.0351351351
## Laboratory Technician 259 0.1750000000
## Manager 102 0.0689189189
## Manufacturing Director 145 0.0979729730
## Research Director 80 0.0540540541
## Research Scientist 292 0.1972972973
## Sales Executive 326 0.2202702703
## Sales Representative 83 0.0560810811
datos <- subset(datos, JobRole=="Sales Executive") ## Seleccionar unicamente el Ejecutivo de Ventas
### EXTRAER LOS DATOS DE ENCUESTA DE PERSONALIDAD ####
datos <- as.matrix(datos[c("Extraversion","Neuroticism","Afability",
"Consciousness", "Openess")])
datos <- as.data.frame(datos) ## cREE UNA BASE CON SOLO LOS DATOS DE PERSONALIDAD
head(datos)
## Extraversion Neuroticism Afability Consciousness Openess
## 1062 3.0 3.0 3.5 3.3 3.7
## 1063 2.3 4.2 3.0 3.0 3.8
## 1064 3.4 2.4 3.3 3.6 3.0
## 1065 2.8 2.7 2.9 3.3 3.0
## 1066 3.3 2.5 3.4 3.8 3.5
## 1067 2.8 3.1 3.5 3.0 3.0
######################################################
#Identificando el número de clústeres
#######################################################
### Vamos a identificar el posible numero de familias (Clusters)
fviz_nbclust(x = datos, FUNcluster = kmeans, method = "wss", k.max = 15,
diss = get_dist(datos, method = "euclidean"), nstart = 50)
#Análisis de clústeres de k medias
### ORGANIZARLOS EN 6 FAMILIAS (CLUSTERS)
km_clusters <- kmeans(x = datos[, c("Extraversion", "Neuroticism",
"Afability", "Consciousness",
"Openess")], centers = 6, nstart = 50)
km_clusters
## K-means clustering with 6 clusters of sizes 53, 50, 45, 41, 75, 62
##
## Cluster means:
## Extraversion Neuroticism Afability Consciousness Openess
## 1 2.983019 3.728302 3.166038 2.930189 2.930189
## 2 3.456000 2.894000 3.474000 3.234000 3.476000
## 3 3.206667 3.953333 3.340000 3.557778 3.566667
## 4 2.956098 2.192683 3.139024 3.063415 3.570732
## 5 2.909333 3.138667 3.196000 3.209333 3.436000
## 6 3.074194 2.606452 3.066129 3.056452 2.883871
##
## Clustering vector:
## 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077
## 5 3 6 6 2 5 5 5 3 1 4 3 5 3 3 6
## 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093
## 3 4 1 1 5 6 3 2 2 5 1 1 4 2 5 2
## 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109
## 3 2 5 6 6 1 5 4 6 2 6 3 1 5 6 2
## 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125
## 2 4 3 4 3 2 1 3 5 3 6 4 4 2 2 2
## 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141
## 4 3 5 5 1 6 1 5 4 2 5 3 2 3 1 1
## 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157
## 5 1 1 5 5 1 4 1 5 2 4 5 5 2 1 2
## 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173
## 6 6 1 4 6 1 6 5 2 6 2 5 3 6 2 4
## 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189
## 4 5 3 2 6 6 3 4 2 2 5 2 1 3 6 1
## 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205
## 3 5 5 6 4 6 6 4 1 6 6 5 5 2 6 5
## 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221
## 4 1 5 5 1 3 4 4 6 1 4 5 3 6 1 5
## 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237
## 5 3 6 5 3 2 1 2 5 3 6 5 3 2 2 1
## 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253
## 2 2 2 2 5 2 1 1 1 3 1 5 1 3 6 3
## 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269
## 1 5 6 6 5 6 1 1 4 4 3 1 5 6 6 6
## 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285
## 2 4 6 6 3 5 5 3 1 4 6 4 4 2 5 2
## 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301
## 4 6 1 5 3 3 4 4 1 6 1 6 1 4 1 6
## 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317
## 3 1 4 2 2 6 4 6 1 1 5 3 6 2 6 5
## 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333
## 1 4 6 6 5 5 6 3 2 5 5 5 6 5 5 1
## 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349
## 5 3 5 6 2 2 5 5 5 5 5 1 5 5 5 6
## 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365
## 4 2 2 6 6 6 5 1 4 6 3 5 3 2 6 1
## 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381
## 4 4 6 5 5 2 6 3 3 1 2 3 4 4 5 1
## 1382 1383 1384 1385 1386 1387
## 3 5 6 5 2 5
##
## Within cluster sum of squares by cluster:
## [1] 26.66453 21.03100 23.29778 15.90634 25.64640 25.13129
## (between_SS / total_SS = 53.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
sil <- silhouette(km_clusters$cluster, dist(datos)) ### REPRESENTACION DE LAS FAMILIAS
fviz_silhouette(sil) ## REPRESENTACION DEL TAMAÑO DE LOS CLUSTERES
## cluster size ave.sil.width
## 1 1 53 0.13
## 2 2 50 0.13
## 3 3 45 0.19
## 4 4 41 0.22
## 5 5 75 0.19
## 6 6 62 0.18
# Visualizaciones
### rEPRESENTACION DE LOS CLUSTERS Y SUS DISTANCIAS AL CENTROIDE
fviz_cluster(object = km_clusters, data = datos, show.clust.cent = TRUE,
ellipse.type = "euclid", star.plot = TRUE, repel = TRUE,
pointsize=0.5,outlier.color= "Blue") +
labs(title = "Resultados clustering K-means") +
theme_bw() +
theme(legend.position = "none")
### ARREGLO DE DATOS
km_clusters$cluster <- as.factor(km_clusters$cluster)
km_clusters$cluster
## 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077
## 5 3 6 6 2 5 5 5 3 1 4 3 5 3 3 6
## 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093
## 3 4 1 1 5 6 3 2 2 5 1 1 4 2 5 2
## 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109
## 3 2 5 6 6 1 5 4 6 2 6 3 1 5 6 2
## 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125
## 2 4 3 4 3 2 1 3 5 3 6 4 4 2 2 2
## 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141
## 4 3 5 5 1 6 1 5 4 2 5 3 2 3 1 1
## 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157
## 5 1 1 5 5 1 4 1 5 2 4 5 5 2 1 2
## 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173
## 6 6 1 4 6 1 6 5 2 6 2 5 3 6 2 4
## 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189
## 4 5 3 2 6 6 3 4 2 2 5 2 1 3 6 1
## 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205
## 3 5 5 6 4 6 6 4 1 6 6 5 5 2 6 5
## 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221
## 4 1 5 5 1 3 4 4 6 1 4 5 3 6 1 5
## 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237
## 5 3 6 5 3 2 1 2 5 3 6 5 3 2 2 1
## 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253
## 2 2 2 2 5 2 1 1 1 3 1 5 1 3 6 3
## 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269
## 1 5 6 6 5 6 1 1 4 4 3 1 5 6 6 6
## 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285
## 2 4 6 6 3 5 5 3 1 4 6 4 4 2 5 2
## 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301
## 4 6 1 5 3 3 4 4 1 6 1 6 1 4 1 6
## 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317
## 3 1 4 2 2 6 4 6 1 1 5 3 6 2 6 5
## 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333
## 1 4 6 6 5 5 6 3 2 5 5 5 6 5 5 1
## 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349
## 5 3 5 6 2 2 5 5 5 5 5 1 5 5 5 6
## 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365
## 4 2 2 6 6 6 5 1 4 6 3 5 3 2 6 1
## 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381
## 4 4 6 5 5 2 6 3 3 1 2 3 4 4 5 1
## 1382 1383 1384 1385 1386 1387
## 3 5 6 5 2 5
## Levels: 1 2 3 4 5 6
clusters <- c(km_clusters$cluster)
datos <- cbind(datos, clusters)
colnames(datos)[6] <- "Cluster"
datos$Cluster <- factor(datos$Cluster,
levels = c(1,2,3,4,5,6),
labels = c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5", "Cluster 6"))
datos <- as.data.frame(datos)
##########################################################
### HACER EL ANALISIS DEFINITIVO DE CLUSTERS##
############################################################
require(cluster) # Cargo la aplicacion para hacer el analisis
pam.res <- pam(datos, 6) ## Un analisis de 6 clusters
fviz_cluster(pam.res, geom = "point", ellipse.type = "norm",
show.clust.cent = TRUE,star.plot = TRUE)+
labs(title = "Resultados clustering K-means")+ theme_bw()
Con esta tabla es que debe realizar la actividad de la semana 2
###### TABLA DE RESULTADOS ######
km_clusters$centers
## Extraversion Neuroticism Afability Consciousness Openess
## 1 2.983019 3.728302 3.166038 2.930189 2.930189
## 2 3.456000 2.894000 3.474000 3.234000 3.476000
## 3 3.206667 3.953333 3.340000 3.557778 3.566667
## 4 2.956098 2.192683 3.139024 3.063415 3.570732
## 5 2.909333 3.138667 3.196000 3.209333 3.436000
## 6 3.074194 2.606452 3.066129 3.056452 2.883871