Importando librerias

library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
library(ggplot2)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

Importando las bases de datos

quejas <- read_excel("C:/Users/maxwi/Desktop/ClaimsData2018.xlsx")

resumen <- read_excel("C:/Users/maxwi/Desktop/TransactionsSummary2018.xlsx")

Uniendo las bases de datos

#Unir las bases de datos con la columna ClaimID
bdc <- merge(quejas, resumen, by= "ClaimID", all= TRUE)
#Base de datos sólo hombres
bdc_hombres <- subset(bdc, Gender == "Male")

Entendimiento y limpieza de la base de datos

summary(bdc_hombres)
##     ClaimID          TotalPaid         TotalReserves      TotalRecovery     
##  Min.   :  650915   Length:65125       Length:65125       Length:65125      
##  1st Qu.:  811585   Class :character   Class :character   Class :character  
##  Median :  845656   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :10174362                                                           
##  3rd Qu.:22721079                                                           
##  Max.   :62203891                                                           
##                                                                             
##  IndemnityPaid       OtherPaid         ClaimStatus       
##  Length:65125       Length:65125       Length:65125      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   IncidentDate                 IncidentDescription
##  Min.   :1969-06-02 00:00:00   Length:65125       
##  1st Qu.:1999-08-16 00:00:00   Class :character   
##  Median :2004-03-10 00:00:00   Mode  :character   
##  Mean   :2004-04-07 12:27:37                      
##  3rd Qu.:2009-01-27 00:00:00                      
##  Max.   :2014-06-27 00:00:00                      
##                                                   
##  ReturnToWorkDate              AverageWeeklyWage  ClaimantOpenedDate           
##  Min.   :1976-10-29 00:00:00   Length:65125       Min.   :1980-07-03 00:00:00  
##  1st Qu.:2001-03-07 00:00:00   Class :character   1st Qu.:1999-09-27 00:00:00  
##  Median :2007-04-05 00:00:00   Mode  :character   Median :2004-04-22 00:00:00  
##  Mean   :2006-03-06 11:39:51                      Mean   :2004-05-25 16:07:39  
##  3rd Qu.:2011-05-28 00:00:00                      3rd Qu.:2009-04-02 00:00:00  
##  Max.   :2014-12-13 00:00:00                      Max.   :2014-06-30 00:00:00  
##  NA's   :28982                                                                 
##  ClaimantClosedDate            EmployerNotificationDate     
##  Min.   :1999-06-01 00:00:00   Min.   :1972-09-10 00:00:00  
##  1st Qu.:2005-03-31 00:00:00   1st Qu.:1999-11-08 00:00:00  
##  Median :2006-05-04 00:00:00   Median :2004-08-27 00:00:00  
##  Mean   :2007-07-21 03:06:41   Mean   :2005-12-16 20:44:36  
##  3rd Qu.:2009-12-17 00:00:00   3rd Qu.:2009-09-17 00:00:00  
##  Max.   :2014-06-30 00:00:00   Max.   :9999-07-21 00:00:00  
##  NA's   :2316                  NA's   :11554                
##   ReceivedDate                   IsDenied         ClaimantAge_at_DOI
##  Min.   :1980-07-03 00:00:00   Length:65125       Length:65125      
##  1st Qu.:1999-10-04 00:00:00   Class :character   Class :character  
##  Median :2004-04-19 00:00:00   Mode  :character   Mode  :character  
##  Mean   :2004-12-30 06:26:39                                        
##  3rd Qu.:2009-02-20 00:00:00                                        
##  Max.   :9999-01-21 00:00:00                                        
##                                                                     
##     Gender          ClaimantType       InjuryNature       BodyPartRegion    
##  Length:65125       Length:65125       Length:65125       Length:65125      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    BodyPart         BillReviewALE         Hospital         PhysicianOutpatient
##  Length:65125       Min.   :  -80.00   Min.   :    -22.9   Min.   :   -549.5  
##  Class :character   1st Qu.:    8.25   1st Qu.:    202.4   1st Qu.:    107.5  
##  Mode  :character   Median :   24.00   Median :    582.1   Median :    223.2  
##                     Mean   :  200.47   Mean   :   4916.0   Mean   :   1759.4  
##                     3rd Qu.:   65.64   3rd Qu.:   2337.9   3rd Qu.:    695.3  
##                     Max.   :56475.30   Max.   :2759604.0   Max.   :1219766.6  
##                     NA's   :53799      NA's   :55514       NA's   :41065      
##        Rx          
##  Min.   :  -469.5  
##  1st Qu.:    22.1  
##  Median :    59.6  
##  Mean   :  1637.2  
##  3rd Qu.:   185.5  
##  Max.   :631635.5  
##  NA's   :56154
#Revisando tipo de variable
str(bdc_hombres)
## 'data.frame':    65125 obs. of  26 variables:
##  $ ClaimID                 : num  650915 650916 650917 650918 650920 ...
##  $ TotalPaid               : chr  "11947.55" "0.00" "9295.89" "1026.29" ...
##  $ TotalReserves           : chr  "0.00" "0.00" "0.00" "0.00" ...
##  $ TotalRecovery           : chr  "0.00" "0.00" "0.00" "0.00" ...
##  $ IndemnityPaid           : chr  "243.65" "0.00" "0.00" "0.00" ...
##  $ OtherPaid               : chr  "11703.90" "0.00" "9295.89" "1026.29" ...
##  $ ClaimStatus             : chr  "C" "C" "C" "C" ...
##  $ IncidentDate            : POSIXct, format: "2009-06-17" "2009-06-26" ...
##  $ IncidentDescription     : chr  "Employee was moving concrete rings and installing a meter. He strained lower back." "Employee was pulling lining. He felt a pop in the back causing a strain." "Employee was in the restroom. He heard a scream from another restroom that startled him and he fell on his left"| __truncated__ "Employee was unloading truck using a pallet jack to unload heavy equipment. Heavy load caused him to lose balan"| __truncated__ ...
##  $ ReturnToWorkDate        : POSIXct, format: "2009-12-08" "2009-06-26" ...
##  $ AverageWeeklyWage       : chr  "639.59" "NULL" "1649.00" "NULL" ...
##  $ ClaimantOpenedDate      : POSIXct, format: "2009-07-02" "2009-07-02" ...
##  $ ClaimantClosedDate      : POSIXct, format: "2010-07-20" "2009-11-25" ...
##  $ EmployerNotificationDate: POSIXct, format: "2009-06-29" "2009-07-01" ...
##  $ ReceivedDate            : POSIXct, format: "2009-07-02" "2009-07-02" ...
##  $ IsDenied                : chr  "0" "0" "0" "0" ...
##  $ ClaimantAge_at_DOI      : chr  "49" "49" "47" "61" ...
##  $ Gender                  : chr  "Male" "Male" "Male" "Male" ...
##  $ ClaimantType            : chr  "Indemnity" "Medical Only" "Indemnity" "Medical Only" ...
##  $ InjuryNature            : chr  "Strain" "Strain" "Fracture" "Contusion" ...
##  $ BodyPartRegion          : chr  "Trunk" "Trunk" "Upper Extremities" "Upper Extremities" ...
##  $ BodyPart                : chr  "Lower Back Area" "Lower Back Area" "Hand" "Shoulder(S)" ...
##  $ BillReviewALE           : num  NA NA NA NA NA ...
##  $ Hospital                : num  NA NA NA NA NA ...
##  $ PhysicianOutpatient     : num  NA NA NA NA NA ...
##  $ Rx                      : num  NA NA NA NA NA ...
#Imputando la base de datos
numeric_cols <- sapply(bdc_hombres, is.character)
bdc_hombres[numeric_cols] <- lapply(bdc_hombres[numeric_cols], as.numeric)
## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción

## Warning in lapply(bdc_hombres[numeric_cols], as.numeric): NAs introducidos por
## coerción
date_cols <- sapply(bdc_hombres, inherits, "POSIXct")
bdc_hombres[date_cols] <- lapply(bdc_hombres[date_cols], as.numeric)
bdc_imputed <- bdc_hombres
for (col in names(bdc_hombres)) {
  bdc_imputed[[col]][is.na(bdc_hombres[[col]])] <- mean(bdc_hombres[[col]], na.rm = TRUE)
}
sum(is.na(bdc_imputed))
## [1] 455875
#Quitando NAs restantes
na_cols <- sapply(bdc_imputed, function(x) all(is.na(x)))
bdc_imputed <- bdc_imputed[, !na_cols]
sum(is.na(bdc_imputed))
## [1] 0
#Revisando la base de datos imputada
summary(bdc_imputed)
##     ClaimID           TotalPaid       TotalReserves     TotalRecovery   
##  Min.   :  650915   Min.   :   -270   Min.   :      0   Min.   :     0  
##  1st Qu.:  811585   1st Qu.:     75   1st Qu.:      0   1st Qu.:     0  
##  Median :  845656   Median :    250   Median :      0   Median :     0  
##  Mean   :10174362   Mean   :   7138   Mean   :   2303   Mean   :    91  
##  3rd Qu.:22721079   3rd Qu.:    976   3rd Qu.:      0   3rd Qu.:     0  
##  Max.   :62203891   Max.   :4527291   Max.   :1625903   Max.   :130541  
##  IndemnityPaid      OtherPaid        IncidentDate        ReturnToWorkDate   
##  Min.   :  -475   Min.   :  -7820   Min.   : -18403200   Min.   :2.154e+08  
##  1st Qu.:     0   1st Qu.:     73   1st Qu.: 934761600   1st Qu.:1.142e+09  
##  Median :     0   Median :    245   Median :1078876800   Median :1.142e+09  
##  Mean   :  3214   Mean   :   3924   Mean   :1081340858   Mean   :1.142e+09  
##  3rd Qu.:     0   3rd Qu.:    896   3rd Qu.:1233014400   3rd Qu.:1.205e+09  
##  Max.   :640732   Max.   :4129915   Max.   :1403827200   Max.   :1.418e+09  
##  AverageWeeklyWage   ClaimantOpenedDate  ClaimantClosedDate 
##  Min.   :      0.0   Min.   :3.314e+08   Min.   :9.282e+08  
##  1st Qu.:    627.1   1st Qu.:9.384e+08   1st Qu.:1.112e+09  
##  Median :    673.0   Median :1.083e+09   Median :1.159e+09  
##  Mean   :    673.0   Mean   :1.086e+09   Mean   :1.185e+09  
##  3rd Qu.:    673.0   3rd Qu.:1.239e+09   3rd Qu.:1.257e+09  
##  Max.   :2024000.0   Max.   :1.404e+09   Max.   :1.404e+09  
##  EmployerNotificationDate  ReceivedDate          IsDenied      
##  Min.   :8.493e+07        Min.   :3.314e+08   Min.   :0.00000  
##  1st Qu.:9.717e+08        1st Qu.:9.390e+08   1st Qu.:0.00000  
##  Median :1.135e+09        Median :1.082e+09   Median :0.00000  
##  Mean   :1.135e+09        Mean   :1.104e+09   Mean   :0.03995  
##  3rd Qu.:1.215e+09        3rd Qu.:1.235e+09   3rd Qu.:0.00000  
##  Max.   :2.534e+11        Max.   :2.534e+11   Max.   :1.00000  
##  ClaimantAge_at_DOI BillReviewALE        Hospital         PhysicianOutpatient
##  Min.   :-7951.00   Min.   :  -80.0   Min.   :    -22.9   Min.   :   -549.5  
##  1st Qu.:   36.00   1st Qu.:  200.5   1st Qu.:   4916.0   1st Qu.:    433.9  
##  Median :   39.93   Median :  200.5   Median :   4916.0   Median :   1759.4  
##  Mean   :   39.93   Mean   :  200.5   Mean   :   4916.0   Mean   :   1759.4  
##  3rd Qu.:   47.00   3rd Qu.:  200.5   3rd Qu.:   4916.0   3rd Qu.:   1759.4  
##  Max.   :   94.00   Max.   :56475.3   Max.   :2759604.0   Max.   :1219766.6  
##        Rx          
##  Min.   :  -469.5  
##  1st Qu.:  1637.2  
##  Median :  1637.2  
##  Mean   :  1637.2  
##  3rd Qu.:  1637.2  
##  Max.   :631635.5
#Buscando columnas sin varianza
zero_var_cols <- sapply(bdc_imputed, function(x) var(x) == 0)

#Eliminando esas columnas
bdc_imputed_no_zero_var <- bdc_imputed[, !zero_var_cols]

Asignando cantidad de clusters

grupos <- 7
segmentos <- kmeans(bdc_imputed_no_zero_var,grupos)
asignacion <- cbind(bdc_imputed_no_zero_var, cluster=segmentos$cluster)

Graficando resultados

fviz_cluster(segmentos, bdc_imputed_no_zero_var)

Submuestreando datos para encontrar número óptimo de clusters

#Semilla
set.seed(123)

#Número de filas para el subconjunto
n_sub = 5000

indices = sample(1:nrow(bdc_imputed_no_zero_var), n_sub)

#Subconjunto
bdc_sub = bdc_imputed_no_zero_var[indices, ]

#Número óptimo de grupos
optimizacion <- clusGap(bdc_sub, FUN=kmeans, nstart=1, K.max = 7)
## Warning: did not converge in 10 iterations
#Graficar
plot(optimizacion, xlab="Número de clusters K")