Importando librerias
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
library(ggplot2)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
Importando las bases de datos
quejas <- read_excel("C:/Users/maxwi/Desktop/ClaimsData2018.xlsx")
resumen <- read_excel("C:/Users/maxwi/Desktop/TransactionsSummary2018.xlsx")
Uniendo las bases de datos
#Unir las bases de datos con la columna ClaimID
bdc <- merge(quejas, resumen, by= "ClaimID", all= TRUE)
#Base de datos sólo hombres
bdc_hombres <- subset(bdc, Gender == "Male")
Entendimiento y limpieza de la base de datos
summary(bdc_hombres)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Length:65125 Length:65125 Length:65125
## 1st Qu.: 811585 Class :character Class :character Class :character
## Median : 845656 Mode :character Mode :character Mode :character
## Mean :10174362
## 3rd Qu.:22721079
## Max. :62203891
##
## IndemnityPaid OtherPaid ClaimStatus
## Length:65125 Length:65125 Length:65125
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## IncidentDate IncidentDescription
## Min. :1969-06-02 00:00:00 Length:65125
## 1st Qu.:1999-08-16 00:00:00 Class :character
## Median :2004-03-10 00:00:00 Mode :character
## Mean :2004-04-07 12:27:37
## 3rd Qu.:2009-01-27 00:00:00
## Max. :2014-06-27 00:00:00
##
## ReturnToWorkDate AverageWeeklyWage ClaimantOpenedDate
## Min. :1976-10-29 00:00:00 Length:65125 Min. :1980-07-03 00:00:00
## 1st Qu.:2001-03-07 00:00:00 Class :character 1st Qu.:1999-09-27 00:00:00
## Median :2007-04-05 00:00:00 Mode :character Median :2004-04-22 00:00:00
## Mean :2006-03-06 11:39:51 Mean :2004-05-25 16:07:39
## 3rd Qu.:2011-05-28 00:00:00 3rd Qu.:2009-04-02 00:00:00
## Max. :2014-12-13 00:00:00 Max. :2014-06-30 00:00:00
## NA's :28982
## ClaimantClosedDate EmployerNotificationDate
## Min. :1999-06-01 00:00:00 Min. :1972-09-10 00:00:00
## 1st Qu.:2005-03-31 00:00:00 1st Qu.:1999-11-08 00:00:00
## Median :2006-05-04 00:00:00 Median :2004-08-27 00:00:00
## Mean :2007-07-21 03:06:41 Mean :2005-12-16 20:44:36
## 3rd Qu.:2009-12-17 00:00:00 3rd Qu.:2009-09-17 00:00:00
## Max. :2014-06-30 00:00:00 Max. :9999-07-21 00:00:00
## NA's :2316 NA's :11554
## ReceivedDate IsDenied ClaimantAge_at_DOI
## Min. :1980-07-03 00:00:00 Length:65125 Length:65125
## 1st Qu.:1999-10-04 00:00:00 Class :character Class :character
## Median :2004-04-19 00:00:00 Mode :character Mode :character
## Mean :2004-12-30 06:26:39
## 3rd Qu.:2009-02-20 00:00:00
## Max. :9999-01-21 00:00:00
##
## Gender ClaimantType InjuryNature BodyPartRegion
## Length:65125 Length:65125 Length:65125 Length:65125
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BodyPart BillReviewALE Hospital PhysicianOutpatient
## Length:65125 Min. : -80.00 Min. : -22.9 Min. : -549.5
## Class :character 1st Qu.: 8.25 1st Qu.: 202.4 1st Qu.: 107.5
## Mode :character Median : 24.00 Median : 582.1 Median : 223.2
## Mean : 200.47 Mean : 4916.0 Mean : 1759.4
## 3rd Qu.: 65.64 3rd Qu.: 2337.9 3rd Qu.: 695.3
## Max. :56475.30 Max. :2759604.0 Max. :1219766.6
## NA's :53799 NA's :55514 NA's :41065
## Rx
## Min. : -469.5
## 1st Qu.: 22.1
## Median : 59.6
## Mean : 1637.2
## 3rd Qu.: 185.5
## Max. :631635.5
## NA's :56154
#Revisando tipo de variable
str(bdc_hombres)
## 'data.frame': 65125 obs. of 26 variables:
## $ ClaimID : num 650915 650916 650917 650918 650920 ...
## $ TotalPaid : chr "11947.55" "0.00" "9295.89" "1026.29" ...
## $ TotalReserves : chr "0.00" "0.00" "0.00" "0.00" ...
## $ TotalRecovery : chr "0.00" "0.00" "0.00" "0.00" ...
## $ IndemnityPaid : chr "243.65" "0.00" "0.00" "0.00" ...
## $ OtherPaid : chr "11703.90" "0.00" "9295.89" "1026.29" ...
## $ ClaimStatus : chr "C" "C" "C" "C" ...
## $ IncidentDate : POSIXct, format: "2009-06-17" "2009-06-26" ...
## $ IncidentDescription : chr "Employee was moving concrete rings and installing a meter. He strained lower back." "Employee was pulling lining. He felt a pop in the back causing a strain." "Employee was in the restroom. He heard a scream from another restroom that startled him and he fell on his left"| __truncated__ "Employee was unloading truck using a pallet jack to unload heavy equipment. Heavy load caused him to lose balan"| __truncated__ ...
## $ ReturnToWorkDate : POSIXct, format: "2009-12-08" "2009-06-26" ...
## $ AverageWeeklyWage : chr "639.59" "NULL" "1649.00" "NULL" ...
## $ ClaimantOpenedDate : POSIXct, format: "2009-07-02" "2009-07-02" ...
## $ ClaimantClosedDate : POSIXct, format: "2010-07-20" "2009-11-25" ...
## $ EmployerNotificationDate: POSIXct, format: "2009-06-29" "2009-07-01" ...
## $ ReceivedDate : POSIXct, format: "2009-07-02" "2009-07-02" ...
## $ IsDenied : chr "0" "0" "0" "0" ...
## $ ClaimantAge_at_DOI : chr "49" "49" "47" "61" ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ ClaimantType : chr "Indemnity" "Medical Only" "Indemnity" "Medical Only" ...
## $ InjuryNature : chr "Strain" "Strain" "Fracture" "Contusion" ...
## $ BodyPartRegion : chr "Trunk" "Trunk" "Upper Extremities" "Upper Extremities" ...
## $ BodyPart : chr "Lower Back Area" "Lower Back Area" "Hand" "Shoulder(S)" ...
## $ BillReviewALE : num NA NA NA NA NA ...
## $ Hospital : num NA NA NA NA NA ...
## $ PhysicianOutpatient : num NA NA NA NA NA ...
## $ Rx : num NA NA NA NA NA ...
#Imputando la base de datos
numeric_cols <- sapply(bdc_hombres, is.character)
bdc_hombres[numeric_cols] <- lapply(bdc_hombres[numeric_cols], as.numeric)
## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción
## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción
date_cols <- sapply(bdc_hombres, inherits, "POSIXct")
bdc_hombres[date_cols] <- lapply(bdc_hombres[date_cols], as.numeric)
bdc_imputed <- bdc_hombres
for (col in names(bdc_hombres)) {
bdc_imputed[[col]][is.na(bdc_hombres[[col]])] <- mean(bdc_hombres[[col]], na.rm = TRUE)
}
sum(is.na(bdc_imputed))
## [1] 455875
#Quitando NAs restantes
na_cols <- sapply(bdc_imputed, function(x) all(is.na(x)))
bdc_imputed <- bdc_imputed[, !na_cols]
sum(is.na(bdc_imputed))
## [1] 0
#Revisando la base de datos imputada
summary(bdc_imputed)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : -270 Min. : 0 Min. : 0
## 1st Qu.: 811585 1st Qu.: 75 1st Qu.: 0 1st Qu.: 0
## Median : 845656 Median : 250 Median : 0 Median : 0
## Mean :10174362 Mean : 7138 Mean : 2303 Mean : 91
## 3rd Qu.:22721079 3rd Qu.: 976 3rd Qu.: 0 3rd Qu.: 0
## Max. :62203891 Max. :4527291 Max. :1625903 Max. :130541
## IndemnityPaid OtherPaid IncidentDate ReturnToWorkDate
## Min. : -475 Min. : -7820 Min. : -18403200 Min. :2.154e+08
## 1st Qu.: 0 1st Qu.: 73 1st Qu.: 934761600 1st Qu.:1.142e+09
## Median : 0 Median : 245 Median :1078876800 Median :1.142e+09
## Mean : 3214 Mean : 3924 Mean :1081340858 Mean :1.142e+09
## 3rd Qu.: 0 3rd Qu.: 896 3rd Qu.:1233014400 3rd Qu.:1.205e+09
## Max. :640732 Max. :4129915 Max. :1403827200 Max. :1.418e+09
## AverageWeeklyWage ClaimantOpenedDate ClaimantClosedDate
## Min. : 0.0 Min. :3.314e+08 Min. :9.282e+08
## 1st Qu.: 627.1 1st Qu.:9.384e+08 1st Qu.:1.112e+09
## Median : 673.0 Median :1.083e+09 Median :1.159e+09
## Mean : 673.0 Mean :1.086e+09 Mean :1.185e+09
## 3rd Qu.: 673.0 3rd Qu.:1.239e+09 3rd Qu.:1.257e+09
## Max. :2024000.0 Max. :1.404e+09 Max. :1.404e+09
## EmployerNotificationDate ReceivedDate IsDenied
## Min. :8.493e+07 Min. :3.314e+08 Min. :0.00000
## 1st Qu.:9.717e+08 1st Qu.:9.390e+08 1st Qu.:0.00000
## Median :1.135e+09 Median :1.082e+09 Median :0.00000
## Mean :1.135e+09 Mean :1.104e+09 Mean :0.03995
## 3rd Qu.:1.215e+09 3rd Qu.:1.235e+09 3rd Qu.:0.00000
## Max. :2.534e+11 Max. :2.534e+11 Max. :1.00000
## ClaimantAge_at_DOI BillReviewALE Hospital PhysicianOutpatient
## Min. :-7951.00 Min. : -80.0 Min. : -22.9 Min. : -549.5
## 1st Qu.: 36.00 1st Qu.: 200.5 1st Qu.: 4916.0 1st Qu.: 433.9
## Median : 39.93 Median : 200.5 Median : 4916.0 Median : 1759.4
## Mean : 39.93 Mean : 200.5 Mean : 4916.0 Mean : 1759.4
## 3rd Qu.: 47.00 3rd Qu.: 200.5 3rd Qu.: 4916.0 3rd Qu.: 1759.4
## Max. : 94.00 Max. :56475.3 Max. :2759604.0 Max. :1219766.6
## Rx
## Min. : -469.5
## 1st Qu.: 1637.2
## Median : 1637.2
## Mean : 1637.2
## 3rd Qu.: 1637.2
## Max. :631635.5
#Buscando columnas sin varianza
zero_var_cols <- sapply(bdc_imputed, function(x) var(x) == 0)
#Eliminando esas columnas
bdc_imputed_no_zero_var <- bdc_imputed[, !zero_var_cols]
Asignando cantidad de clusters
grupos <- 7
segmentos <- kmeans(bdc_imputed_no_zero_var,grupos)
asignacion <- cbind(bdc_imputed_no_zero_var, cluster=segmentos$cluster)
Graficando resultados
fviz_cluster(segmentos, bdc_imputed_no_zero_var)
Submuestreando datos para encontrar número óptimo de clusters
#Semilla
set.seed(123)
#Número de filas para el subconjunto
n_sub = 5000
indices = sample(1:nrow(bdc_imputed_no_zero_var), n_sub)
#Subconjunto
bdc_sub = bdc_imputed_no_zero_var[indices, ]
#Número óptimo de grupos
optimizacion <- clusGap(bdc_sub, FUN=kmeans, nstart=1, K.max = 7)
## Warning: did not converge in 10 iterations
#Graficar
plot(optimizacion, xlab="Número de clusters K")