Este es un proyecto proveniente de North Carolina, US, quienes gestionan reclamaciones de compensación laboral para otros clientes. Buscan asesoramiento sobre cómo se puede utilizar el análisis de datos para identificar los principales factores de los costos de las reclamaciones y el tiempo para procesar las reclamaciones. La empresa está interesada en utilizar el análisis de datos para ser eficiente y receptiva ante los reclamantes.

BASE DE DATOS

Importar Bases de Datos

library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
df1 <- read_xlsx("/Users/benjaminreyessanchez/Downloads/ClaimsData2018.xlsx")
df2 <- read.csv("/Users/benjaminreyessanchez/Downloads/TransactionsSummary2018.csv")

Unir y corregir

df <- left_join(df1, df2, by = "ClaimID")

str(df)
## tibble [134,004 × 26] (S3: tbl_df/tbl/data.frame)
##  $ ClaimID                 : num [1:134004] 650915 650916 650917 650918 650919 ...
##  $ TotalPaid               : chr [1:134004] "11947.55" "0.00" "9295.89" "1026.29" ...
##  $ TotalReserves           : chr [1:134004] "0.00" "0.00" "0.00" "0.00" ...
##  $ TotalRecovery           : chr [1:134004] "0.00" "0.00" "0.00" "0.00" ...
##  $ IndemnityPaid           : chr [1:134004] "243.65" "0.00" "0.00" "0.00" ...
##  $ OtherPaid               : chr [1:134004] "11703.90" "0.00" "9295.89" "1026.29" ...
##  $ ClaimStatus             : chr [1:134004] "C" "C" "C" "C" ...
##  $ IncidentDate            : POSIXct[1:134004], format: "2009-06-17" "2009-06-26" ...
##  $ IncidentDescription     : chr [1:134004] "Employee was moving concrete rings and installing a meter. He strained lower back." "Employee was pulling lining. He felt a pop in the back causing a strain." "Employee was in the restroom. He heard a scream from another restroom that startled him and he fell on his left"| __truncated__ "Employee was unloading truck using a pallet jack to unload heavy equipment. Heavy load caused him to lose balan"| __truncated__ ...
##  $ ReturnToWorkDate        : POSIXct[1:134004], format: "2009-12-08" "2009-06-26" ...
##  $ AverageWeeklyWage       : chr [1:134004] "639.59" "NULL" "1649.00" "NULL" ...
##  $ ClaimantOpenedDate      : POSIXct[1:134004], format: "2009-07-02" "2009-07-02" ...
##  $ ClaimantClosedDate      : POSIXct[1:134004], format: "2010-07-20" "2009-11-25" ...
##  $ EmployerNotificationDate: POSIXct[1:134004], format: "2009-06-29" "2009-07-01" ...
##  $ ReceivedDate            : POSIXct[1:134004], format: "2009-07-02" "2009-07-02" ...
##  $ IsDenied                : chr [1:134004] "0" "0" "0" "0" ...
##  $ ClaimantAge_at_DOI      : chr [1:134004] "49" "49" "47" "61" ...
##  $ Gender                  : chr [1:134004] "Male" "Male" "Male" "Male" ...
##  $ ClaimantType            : chr [1:134004] "Indemnity" "Medical Only" "Indemnity" "Medical Only" ...
##  $ InjuryNature            : chr [1:134004] "Strain" "Strain" "Fracture" "Contusion" ...
##  $ BodyPartRegion          : chr [1:134004] "Trunk" "Trunk" "Upper Extremities" "Upper Extremities" ...
##  $ BodyPart                : chr [1:134004] "Lower Back Area" "Lower Back Area" "Hand" "Shoulder(S)" ...
##  $ BillReviewALE           : num [1:134004] NA NA NA NA NA NA NA NA 16 8 ...
##  $ Hospital                : num [1:134004] NA NA NA NA NA ...
##  $ PhysicianOutpatient     : num [1:134004] NA NA NA NA NA ...
##  $ Rx                      : num [1:134004] NA NA NA NA NA ...
summary(df)
##     ClaimID          TotalPaid         TotalReserves      TotalRecovery     
##  Min.   :  650915   Length:134004      Length:134004      Length:134004     
##  1st Qu.:  811125   Class :character   Class :character   Class :character  
##  Median :  844626   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :10149151                                                           
##  3rd Qu.:22716506                                                           
##  Max.   :62203891                                                           
##                                                                             
##  IndemnityPaid       OtherPaid         ClaimStatus       
##  Length:134004      Length:134004      Length:134004     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   IncidentDate                   IncidentDescription
##  Min.   :1947-02-24 00:00:00.0   Length:134004      
##  1st Qu.:1998-12-21 00:00:00.0   Class :character   
##  Median :2004-01-05 00:00:00.0   Mode  :character   
##  Mean   :2003-12-08 16:48:47.2                      
##  3rd Qu.:2009-02-02 00:00:00.0                      
##  Max.   :2014-06-27 00:00:00.0                      
##                                                     
##  ReturnToWorkDate                 AverageWeeklyWage 
##  Min.   :1976-10-29 00:00:00.00   Length:134004     
##  1st Qu.:2002-04-25 00:00:00.00   Class :character  
##  Median :2007-07-09 00:00:00.00   Mode  :character  
##  Mean   :2006-06-01 15:30:12.00                     
##  3rd Qu.:2011-06-01 00:00:00.00                     
##  Max.   :2015-05-07 00:00:00.00                     
##  NA's   :58637                                      
##  ClaimantOpenedDate              ClaimantClosedDate              
##  Min.   :1947-02-24 00:00:00.0   Min.   :1999-06-01 00:00:00.00  
##  1st Qu.:1999-02-09 00:00:00.0   1st Qu.:2005-03-31 00:00:00.00  
##  Median :2004-02-17 00:00:00.0   Median :2006-04-04 12:00:00.00  
##  Mean   :2004-01-23 11:32:07.5   Mean   :2007-05-24 02:11:30.00  
##  3rd Qu.:2009-04-09 06:00:00.0   3rd Qu.:2009-11-11 00:00:00.00  
##  Max.   :2014-06-30 00:00:00.0   Max.   :2014-06-30 00:00:00.00  
##                                  NA's   :4678                    
##  EmployerNotificationDate         ReceivedDate                  
##  Min.   :1972-09-10 00:00:00.0   Min.   :1947-02-24 00:00:00.0  
##  1st Qu.:2000-03-13 00:00:00.0   1st Qu.:1999-02-09 00:00:00.0  
##  Median :2004-12-28 00:00:00.0   Median :2004-02-13 00:00:00.0  
##  Mean   :2005-08-29 04:49:34.2   Mean   :2004-07-19 11:29:03.0  
##  3rd Qu.:2009-11-03 00:00:00.0   3rd Qu.:2009-02-27 00:00:00.0  
##  Max.   :9999-07-21 00:00:00.0   Max.   :9999-07-21 00:00:00.0  
##  NA's   :22288                                                  
##    IsDenied         ClaimantAge_at_DOI    Gender          ClaimantType      
##  Length:134004      Length:134004      Length:134004      Length:134004     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  InjuryNature       BodyPartRegion       BodyPart         BillReviewALE     
##  Length:134004      Length:134004      Length:134004      Min.   : -456.00  
##  Class :character   Class :character   Class :character   1st Qu.:    8.25  
##  Mode  :character   Mode  :character   Mode  :character   Median :   24.00  
##                                                           Mean   :  189.89  
##                                                           3rd Qu.:   65.20  
##                                                           Max.   :56475.30  
##                                                           NA's   :109798    
##     Hospital         PhysicianOutpatient       Rx          
##  Min.   : -12570.4   Min.   :   -549.5   Min.   :  -469.5  
##  1st Qu.:    202.5   1st Qu.:    107.3   1st Qu.:    22.7  
##  Median :    580.5   Median :    222.7   Median :    60.0  
##  Mean   :   4773.9   Mean   :   1762.7   Mean   :  1459.0  
##  3rd Qu.:   2298.5   3rd Qu.:    689.9   3rd Qu.:   178.3  
##  Max.   :2759604.0   Max.   :1481468.5   Max.   :631635.5  
##  NA's   :113050      NA's   :81900       NA's   :114237
sapply(df, function(x) sum(is.na(x)))
##                  ClaimID                TotalPaid            TotalReserves 
##                        0                        0                        0 
##            TotalRecovery            IndemnityPaid                OtherPaid 
##                        0                        0                        0 
##              ClaimStatus             IncidentDate      IncidentDescription 
##                        0                        0                        0 
##         ReturnToWorkDate        AverageWeeklyWage       ClaimantOpenedDate 
##                    58637                        0                        0 
##       ClaimantClosedDate EmployerNotificationDate             ReceivedDate 
##                     4678                    22288                        0 
##                 IsDenied       ClaimantAge_at_DOI                   Gender 
##                        0                        0                        0 
##             ClaimantType             InjuryNature           BodyPartRegion 
##                        0                        0                        0 
##                 BodyPart            BillReviewALE                 Hospital 
##                        0                   109798                   113050 
##      PhysicianOutpatient                       Rx 
##                    81900                   114237

Podemos Observar lo siguiente: - Tenemos muchos valores como NA. - Una gran parte de las variables están determinadas como carácter y hay que cambiarlas a numérico. - Hay columnas que podemos eliminar que no nos sirven para ninguno de los análisis.

df$TotalPaid <- as.numeric(df$TotalPaid)
df$TotalReserves <- as.numeric(df$TotalReserves)
df$TotalRecovery <- as.numeric(df$TotalRecovery)
df$IndemnityPaid <- as.numeric(df$IndemnityPaid)
df$OtherPaid <- as.numeric(df$OtherPaid)
df$AverageWeeklyWage <- as.numeric(df$AverageWeeklyWage)
## Warning: NAs introduced by coercion
df$ClaimantAge_at_DOI <- as.numeric(df$ClaimantAge_at_DOI)
## Warning: NAs introduced by coercion
summary(df)
##     ClaimID           TotalPaid       TotalReserves     TotalRecovery      
##  Min.   :  650915   Min.   :   -270   Min.   :      0   Min.   :     0.00  
##  1st Qu.:  811125   1st Qu.:     60   1st Qu.:      0   1st Qu.:     0.00  
##  Median :  844626   Median :    235   Median :      0   Median :     0.00  
##  Mean   :10149151   Mean   :   6746   Mean   :   2233   Mean   :    68.88  
##  3rd Qu.:22716506   3rd Qu.:    938   3rd Qu.:      0   3rd Qu.:     0.00  
##  Max.   :62203891   Max.   :4527291   Max.   :2069575   Max.   :130541.03  
##                                                                            
##  IndemnityPaid      OtherPaid       ClaimStatus       
##  Min.   :  -475   Min.   :  -7820   Length:134004     
##  1st Qu.:     0   1st Qu.:     58   Class :character  
##  Median :     0   Median :    230   Mode  :character  
##  Mean   :  3061   Mean   :   3685                     
##  3rd Qu.:     0   3rd Qu.:    855                     
##  Max.   :640732   Max.   :4129915                     
##                                                       
##   IncidentDate                   IncidentDescription
##  Min.   :1947-02-24 00:00:00.0   Length:134004      
##  1st Qu.:1998-12-21 00:00:00.0   Class :character   
##  Median :2004-01-05 00:00:00.0   Mode  :character   
##  Mean   :2003-12-08 16:48:47.2                      
##  3rd Qu.:2009-02-02 00:00:00.0                      
##  Max.   :2014-06-27 00:00:00.0                      
##                                                     
##  ReturnToWorkDate                 AverageWeeklyWage  
##  Min.   :1976-10-29 00:00:00.00   Min.   :      0.0  
##  1st Qu.:2002-04-25 00:00:00.00   1st Qu.:    300.0  
##  Median :2007-07-09 00:00:00.00   Median :    492.0  
##  Mean   :2006-06-01 15:30:12.00   Mean   :    587.3  
##  3rd Qu.:2011-06-01 00:00:00.00   3rd Qu.:    660.4  
##  Max.   :2015-05-07 00:00:00.00   Max.   :2024000.0  
##  NA's   :58637                    NA's   :84924      
##  ClaimantOpenedDate              ClaimantClosedDate              
##  Min.   :1947-02-24 00:00:00.0   Min.   :1999-06-01 00:00:00.00  
##  1st Qu.:1999-02-09 00:00:00.0   1st Qu.:2005-03-31 00:00:00.00  
##  Median :2004-02-17 00:00:00.0   Median :2006-04-04 12:00:00.00  
##  Mean   :2004-01-23 11:32:07.5   Mean   :2007-05-24 02:11:30.00  
##  3rd Qu.:2009-04-09 06:00:00.0   3rd Qu.:2009-11-11 00:00:00.00  
##  Max.   :2014-06-30 00:00:00.0   Max.   :2014-06-30 00:00:00.00  
##                                  NA's   :4678                    
##  EmployerNotificationDate         ReceivedDate                  
##  Min.   :1972-09-10 00:00:00.0   Min.   :1947-02-24 00:00:00.0  
##  1st Qu.:2000-03-13 00:00:00.0   1st Qu.:1999-02-09 00:00:00.0  
##  Median :2004-12-28 00:00:00.0   Median :2004-02-13 00:00:00.0  
##  Mean   :2005-08-29 04:49:34.2   Mean   :2004-07-19 11:29:03.0  
##  3rd Qu.:2009-11-03 00:00:00.0   3rd Qu.:2009-02-27 00:00:00.0  
##  Max.   :9999-07-21 00:00:00.0   Max.   :9999-07-21 00:00:00.0  
##  NA's   :22288                                                  
##    IsDenied         ClaimantAge_at_DOI    Gender          ClaimantType      
##  Length:134004      Min.   :-8000.00   Length:134004      Length:134004     
##  Class :character   1st Qu.:   33.00   Class :character   Class :character  
##  Mode  :character   Median :   42.00   Mode  :character   Mode  :character  
##                     Mean   :   39.85                                        
##                     3rd Qu.:   51.00                                        
##                     Max.   :   94.00                                        
##                     NA's   :45078                                           
##  InjuryNature       BodyPartRegion       BodyPart         BillReviewALE     
##  Length:134004      Length:134004      Length:134004      Min.   : -456.00  
##  Class :character   Class :character   Class :character   1st Qu.:    8.25  
##  Mode  :character   Mode  :character   Mode  :character   Median :   24.00  
##                                                           Mean   :  189.89  
##                                                           3rd Qu.:   65.20  
##                                                           Max.   :56475.30  
##                                                           NA's   :109798    
##     Hospital         PhysicianOutpatient       Rx          
##  Min.   : -12570.4   Min.   :   -549.5   Min.   :  -469.5  
##  1st Qu.:    202.5   1st Qu.:    107.3   1st Qu.:    22.7  
##  Median :    580.5   Median :    222.7   Median :    60.0  
##  Mean   :   4773.9   Mean   :   1762.7   Mean   :  1459.0  
##  3rd Qu.:   2298.5   3rd Qu.:    689.9   3rd Qu.:   178.3  
##  Max.   :2759604.0   Max.   :1481468.5   Max.   :631635.5  
##  NA's   :113050      NA's   :81900       NA's   :114237

Ahora eliminamos los valores negativos que no pueden existir dentro de nuestra base de datos.

df_inicial<-df

df_inicial<- subset(df_inicial, TotalPaid>0)
df_inicial<- subset(df_inicial, IndemnityPaid>0)

summary(df_inicial)
##     ClaimID           TotalPaid       TotalReserves     TotalRecovery     
##  Min.   :  650915   Min.   :     27   Min.   :      0   Min.   :     0.0  
##  1st Qu.:  813720   1st Qu.:   3766   1st Qu.:      0   1st Qu.:     0.0  
##  Median :  847241   Median :  14088   Median :      0   Median :     0.0  
##  Mean   :10104568   Mean   :  44975   Mean   :  15078   Mean   :   407.5  
##  3rd Qu.:22716889   3rd Qu.:  40479   3rd Qu.:      0   3rd Qu.:     0.0  
##  Max.   :62032209   Max.   :4527291   Max.   :2069575   Max.   :130541.0  
##                                                                           
##  IndemnityPaid        OtherPaid       ClaimStatus       
##  Min.   :    15.5   Min.   :  -7820   Length:18277      
##  1st Qu.:  1265.5   1st Qu.:   1608   Class :character  
##  Median :  5663.6   Median :   6773   Mode  :character  
##  Mean   : 22439.6   Mean   :  22535                     
##  3rd Qu.: 17481.1   3rd Qu.:  20426                     
##  Max.   :640732.3   Max.   :4129915                     
##                                                         
##   IncidentDate                    IncidentDescription
##  Min.   :1967-02-06 00:00:00.00   Length:18277       
##  1st Qu.:1998-08-17 00:00:00.00   Class :character   
##  Median :2003-11-13 00:00:00.00   Mode  :character   
##  Mean   :2003-05-16 07:07:44.25                      
##  3rd Qu.:2008-09-15 00:00:00.00                      
##  Max.   :2014-06-18 00:00:00.00                      
##                                                      
##  ReturnToWorkDate                 AverageWeeklyWage
##  Min.   :1984-01-01 00:00:00.00   Min.   :    1.0  
##  1st Qu.:2000-12-10 12:00:00.00   1st Qu.:  438.6  
##  Median :2005-09-12 00:00:00.00   Median :  541.5  
##  Mean   :2005-05-14 09:48:15.77   Mean   :  595.5  
##  3rd Qu.:2010-01-20 00:00:00.00   3rd Qu.:  699.6  
##  Max.   :2014-06-19 00:00:00.00   Max.   :15062.3  
##  NA's   :6417                     NA's   :3918     
##  ClaimantOpenedDate               ClaimantClosedDate              
##  Min.   :1967-02-06 00:00:00.00   Min.   :1999-07-12 00:00:00.00  
##  1st Qu.:1998-11-20 00:00:00.00   1st Qu.:2005-03-31 00:00:00.00  
##  Median :2004-02-17 00:00:00.00   Median :2006-06-05 00:00:00.00  
##  Mean   :2003-09-11 12:50:56.16   Mean   :2007-04-17 09:40:29.75  
##  3rd Qu.:2009-06-16 00:00:00.00   3rd Qu.:2010-04-26 00:00:00.00  
##  Max.   :2014-06-24 00:00:00.00   Max.   :2014-06-30 00:00:00.00  
##                                   NA's   :2262                    
##  EmployerNotificationDate          ReceivedDate                   
##  Min.   :1978-04-07 00:00:00.00   Min.   :1967-02-06 00:00:00.00  
##  1st Qu.:2000-06-23 18:00:00.00   1st Qu.:1998-11-05 00:00:00.00  
##  Median :2005-03-16 12:00:00.00   Median :2004-01-05 00:00:00.00  
##  Mean   :2005-06-15 21:45:38.46   Mean   :2003-10-23 12:36:54.72  
##  3rd Qu.:2009-08-15 12:00:00.00   3rd Qu.:2008-10-01 00:00:00.00  
##  Max.   :9999-07-01 00:00:00.00   Max.   :4996-02-17 00:00:00.00  
##  NA's   :4237                                                     
##    IsDenied         ClaimantAge_at_DOI    Gender          ClaimantType      
##  Length:18277       Min.   :-5973.00   Length:18277       Length:18277      
##  Class :character   1st Qu.:   37.00   Class :character   Class :character  
##  Mode  :character   Median :   45.00   Mode  :character   Mode  :character  
##                     Mean   :   43.56                                        
##                     3rd Qu.:   53.00                                        
##                     Max.   :   87.00                                        
##                     NA's   :5549                                            
##  InjuryNature       BodyPartRegion       BodyPart         BillReviewALE     
##  Length:18277       Length:18277       Length:18277       Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    8.25  
##  Mode  :character   Mode  :character   Mode  :character   Median :   24.00  
##                                                           Mean   :  181.49  
##                                                           3rd Qu.:   64.00  
##                                                           Max.   :15836.77  
##                                                           NA's   :15167     
##     Hospital        PhysicianOutpatient       Rx           
##  Min.   :     0.0   Min.   :   -18.8    Min.   :     0.00  
##  1st Qu.:   185.7   1st Qu.:   104.0    1st Qu.:    21.66  
##  Median :   527.9   Median :   216.2    Median :    56.91  
##  Mean   :  4185.7   Mean   :  1621.1    Mean   :  1388.27  
##  3rd Qu.:  2047.0   3rd Qu.:   655.3    3rd Qu.:   175.90  
##  Max.   :408665.0   Max.   :318039.6    Max.   :187627.17  
##  NA's   :15339      NA's   :10953       NA's   :15523
#Solamente aplicado a dos variables ya que si se hicieran con las 3 variables faltantes que contienen negativos se reducen demasiado el número de obs.

REGRESIÓN LINEAL

No hice una nueva columna para costo total incurrido porque me di cuenta que la variable “TotalPaid” ya es la sumatoria total del costo total de cada paciente, tomando en cuenta TotalReserves, IndemnityPaid y OtherPaid

Determinar Variables y Eliminar Outliers

#PARA EL COSTO TOTAL INCURRIDO

df_regr <- df_inicial
boxplot(df_inicial$TotalPaid, horizontal = TRUE)

df_regr <- df_inicial[df_inicial$TotalPaid > 0 & df_inicial$TotalPaid < 150000, ] 
boxplot(df_regr$TotalPaid, horizontal = TRUE)

df_regr <- subset(df_regr, select = -IncidentDescription)
df_regr$AverageWeeklyWage <- replace(df_regr$AverageWeeklyWage, is.na(df_regr$AverageWeeklyWage), 592.9)
summary(df_regr)
##     ClaimID           TotalPaid         TotalReserves    TotalRecovery    
##  Min.   :  650915   Min.   :    26.88   Min.   :     0   Min.   :    0.0  
##  1st Qu.:  814288   1st Qu.:  3330.54   1st Qu.:     0   1st Qu.:    0.0  
##  Median :  850173   Median : 11847.95   Median :     0   Median :    0.0  
##  Mean   :10434983   Mean   : 23907.09   Mean   :  4933   Mean   :  307.4  
##  3rd Qu.:22717577   3rd Qu.: 31129.08   3rd Qu.:     0   3rd Qu.:    0.0  
##  Max.   :62032209   Max.   :149949.17   Max.   :659235   Max.   :85000.0  
##                                                                           
##  IndemnityPaid        OtherPaid      ClaimStatus       
##  Min.   :    15.5   Min.   : -7820   Length:16857      
##  1st Qu.:  1079.4   1st Qu.:  1413   Class :character  
##  Median :  4664.3   Median :  5614   Mode  :character  
##  Mean   : 11742.7   Mean   : 12164                     
##  3rd Qu.: 13089.5   3rd Qu.: 16237                     
##  Max.   :147420.0   Max.   :137142                     
##                                                        
##   IncidentDate                    ReturnToWorkDate                
##  Min.   :1971-11-24 00:00:00.00   Min.   :1984-01-01 00:00:00.00  
##  1st Qu.:1998-07-18 00:00:00.00   1st Qu.:2000-10-19 18:00:00.00  
##  Median :2003-10-09 00:00:00.00   Median :2005-06-28 00:00:00.00  
##  Mean   :2003-05-21 23:48:48.55   Mean   :2005-03-26 13:16:08.78  
##  3rd Qu.:2008-10-20 00:00:00.00   3rd Qu.:2009-11-18 00:00:00.00  
##  Max.   :2014-06-18 00:00:00.00   Max.   :2014-06-19 00:00:00.00  
##                                   NA's   :5605                    
##  AverageWeeklyWage ClaimantOpenedDate              
##  Min.   :    1.0   Min.   :1971-11-24 00:00:00.00  
##  1st Qu.:  467.0   1st Qu.:1998-09-22 00:00:00.00  
##  Median :  592.9   Median :2003-11-18 00:00:00.00  
##  Mean   :  592.9   Mean   :2003-07-29 11:06:44.26  
##  3rd Qu.:  638.4   3rd Qu.:2009-03-11 00:00:00.00  
##  Max.   :15062.3   Max.   :2014-06-24 00:00:00.00  
##                                                    
##  ClaimantClosedDate               EmployerNotificationDate        
##  Min.   :1999-07-12 00:00:00.00   Min.   :1978-04-07 00:00:00.00  
##  1st Qu.:2005-03-31 00:00:00.00   1st Qu.:2000-06-21 00:00:00.00  
##  Median :2006-03-31 00:00:00.00   Median :2005-04-09 00:00:00.00  
##  Mean   :2007-03-24 08:30:53.72   Mean   :2005-08-15 23:35:58.44  
##  3rd Qu.:2010-03-29 00:00:00.00   3rd Qu.:2009-11-04 18:00:00.00  
##  Max.   :2014-06-30 00:00:00.00   Max.   :9999-07-01 00:00:00.00  
##  NA's   :1445                     NA's   :3971                    
##   ReceivedDate                      IsDenied         ClaimantAge_at_DOI
##  Min.   :1971-11-24 00:00:00.00   Length:16857       Min.   :-5973.00  
##  1st Qu.:1998-09-22 00:00:00.00   Class :character   1st Qu.:   36.00  
##  Median :2003-11-19 00:00:00.00   Mode  :character   Median :   45.00  
##  Mean   :2003-10-25 04:58:07.87                      Mean   :   43.29  
##  3rd Qu.:2008-11-12 00:00:00.00                      3rd Qu.:   53.00  
##  Max.   :4996-02-17 00:00:00.00                      Max.   :   87.00  
##                                                      NA's   :5366      
##     Gender          ClaimantType       InjuryNature       BodyPartRegion    
##  Length:16857       Length:16857       Length:16857       Length:16857      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    BodyPart         BillReviewALE         Hospital      PhysicianOutpatient
##  Length:16857       Min.   :    0.00   Min.   :     0   Min.   :   -18.8   
##  Class :character   1st Qu.:    8.25   1st Qu.:   187   1st Qu.:   105.0   
##  Mode  :character   Median :   24.00   Median :   536   Median :   216.9   
##                     Mean   :  179.20   Mean   :  4292   Mean   :  1662.5   
##                     3rd Qu.:   64.00   3rd Qu.:  2075   3rd Qu.:   656.3   
##                     Max.   :15836.77   Max.   :408665   Max.   :318039.6   
##                     NA's   :14035      NA's   :14200    NA's   :10226      
##        Rx           
##  Min.   :     0.00  
##  1st Qu.:    21.70  
##  Median :    56.38  
##  Mean   :  1326.86  
##  3rd Qu.:   177.70  
##  Max.   :187627.17  
##  NA's   :14338
count(df_regr, ClaimantType, sort = TRUE)
## # A tibble: 3 × 2
##   ClaimantType     n
##   <chr>        <int>
## 1 Indemnity    16821
## 2 Medical Only    26
## 3 Report Only     10
count(df_regr, InjuryNature, sort = TRUE)
## # A tibble: 45 × 2
##    InjuryNature                         n
##    <chr>                            <int>
##  1 Strain                            5184
##  2 Contusion                         2842
##  3 Non-Standard Code                 2392
##  4 Sprain                            1738
##  5 Fracture                          1345
##  6 All Other Specific Injuries, Noc  1162
##  7 Laceration                         411
##  8 Multiple Physical Injuries Only    363
##  9 Carpal Tunnel Syndrome             204
## 10 Dislocation                        136
## # ℹ 35 more rows
count(df_regr, BodyPartRegion, sort = TRUE)
## # A tibble: 7 × 2
##   BodyPartRegion          n
##   <chr>               <int>
## 1 Upper Extremities    4398
## 2 Lower Extremities    4057
## 3 Trunk                3077
## 4 Non-Standard Code    2317
## 5 Multiple Body Parts  1889
## 6 Head                  630
## 7 Neck                  489
count(df_regr, IsDenied, sort = TRUE)
## # A tibble: 2 × 2
##   IsDenied     n
##   <chr>    <int>
## 1 0        16399
## 2 1          458
count(df_regr, Gender, sort = TRUE)
## # A tibble: 3 × 2
##   Gender            n
##   <chr>         <int>
## 1 Male           8023
## 2 Female         7037
## 3 Not Available  1797
count(df_regr, ClaimStatus, sort = TRUE)
## # A tibble: 3 × 2
##   ClaimStatus     n
##   <chr>       <int>
## 1 C           15412
## 2 O            1079
## 3 R             366
#PARA EL TIEMPO DE PROCESAMIENTO

#Utilizando las Variables de las Fechas.

df_regr$processing_time <- as.numeric(df_regr$ClaimantClosedDate - df_regr$ClaimantOpenedDate, units = "days")

head(df_regr$processing_time)
## [1]  383  673  235 2080 2087 2347

Crear Regresión

#PARA EL COSTO TOTAL INCURRIDO

regresion_cti <- lm(TotalPaid ~ Gender + processing_time + AverageWeeklyWage + BodyPartRegion, data = df_regr )
summary (regresion_cti)
## 
## Call:
## lm(formula = TotalPaid ~ Gender + processing_time + AverageWeeklyWage + 
##     BodyPartRegion, data = df_regr)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -81253 -16178  -9085   5727 140331 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2810.2072  1332.4317   2.109 0.034954 *  
## GenderMale                         1964.1063   466.2457   4.213 2.54e-05 ***
## GenderNot Available                2960.8020  1270.2260   2.331 0.019770 *  
## processing_time                       3.4264     0.2051  16.706  < 2e-16 ***
## AverageWeeklyWage                    15.2510     0.9181  16.612  < 2e-16 ***
## BodyPartRegionLower Extremities    4221.1216  1230.4767   3.430 0.000604 ***
## BodyPartRegionMultiple Body Parts  8048.6200  1313.0744   6.130 9.02e-10 ***
## BodyPartRegionNeck                 6304.1084  1833.5618   3.438 0.000587 ***
## BodyPartRegionNon-Standard Code   -9219.4129  1585.7176  -5.814 6.22e-09 ***
## BodyPartRegionTrunk                3497.3291  1254.7274   2.787 0.005321 ** 
## BodyPartRegionUpper Extremities    4038.4562  1223.0723   3.302 0.000963 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26740 on 15401 degrees of freedom
##   (1445 observations deleted due to missingness)
## Multiple R-squared:  0.04246,    Adjusted R-squared:  0.04184 
## F-statistic:  68.3 on 10 and 15401 DF,  p-value: < 2.2e-16
#PARA EL TIEMPO DE PROCESAMIENTO

regresion_pt <- lm(processing_time ~ Gender + TotalPaid + AverageWeeklyWage + BodyPartRegion + ClaimantType, data = df_regr )
summary (regresion_pt)
## 
## Call:
## lm(formula = processing_time ~ Gender + TotalPaid + AverageWeeklyWage + 
##     BodyPartRegion + ClaimantType, data = df_regr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2925.9  -804.7  -188.7   752.1  7257.2 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        1.766e+03  4.989e+01  35.391  < 2e-16 ***
## GenderMale                         8.950e+01  1.815e+01   4.932 8.23e-07 ***
## GenderNot Available                5.292e+02  4.927e+01  10.741  < 2e-16 ***
## TotalPaid                          5.189e-03  3.109e-04  16.688  < 2e-16 ***
## AverageWeeklyWage                 -1.040e+00  3.507e-02 -29.657  < 2e-16 ***
## BodyPartRegionLower Extremities   -3.004e+01  4.792e+01  -0.627   0.5307    
## BodyPartRegionMultiple Body Parts  4.066e+02  5.107e+01   7.962 1.82e-15 ***
## BodyPartRegionNeck                -1.104e+02  7.140e+01  -1.547   0.1219    
## BodyPartRegionNon-Standard Code    1.353e+03  6.084e+01  22.244  < 2e-16 ***
## BodyPartRegionTrunk                2.646e+01  4.885e+01   0.542   0.5881    
## BodyPartRegionUpper Extremities   -6.006e+01  4.763e+01  -1.261   0.2073    
## ClaimantTypeMedical Only          -5.019e+02  2.045e+02  -2.455   0.0141 *  
## ClaimantTypeReport Only            7.892e+02  3.299e+02   2.392   0.0168 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1041 on 15399 degrees of freedom
##   (1445 observations deleted due to missingness)
## Multiple R-squared:  0.2956, Adjusted R-squared:  0.295 
## F-statistic: 538.4 on 12 and 15399 DF,  p-value: < 2.2e-16

Construir Modelo Predictivo

#PARA EL COSTO TOTAL INCURRIDO

df_cti <- data.frame(Gender="Male", processing_time=2080, AverageWeeklyWage= 1126.85, BodyPartRegion="Upper Extremities")
predict(regresion_cti,df_cti)
##        1 
## 33125.36
#PARA EL TIEMPO DE PROCESAMIENTO

df_pt <- data.frame(Gender="Male", TotalPaid=375.61, AverageWeeklyWage=320, BodyPartRegion="Trunk", ClaimantType="Indemnity")
predict(regresion_pt,df_pt)
##       1 
## 1550.67

Modelo 1: Resultado = 33125.36 / Real= 3873.13 Modelo 2: Resultado = 1550.67 / Real= 2087

ÁRBOL DE DECISIÓN

Bases de Datos y Limpieza

# Extraer Variables de Interés
df_arbol1 <- df_regr[,c("TotalPaid", "processing_time", "Gender", "ClaimantType")]

df_arbol1 <- na.omit(df_arbol1)

# ¿Cúantos NA tengo en la base de datos?
sapply(df_arbol1, function(x) sum(is.na(x)))
##       TotalPaid processing_time          Gender    ClaimantType 
##               0               0               0               0
df_arbol2 <- df_regr[,c("Gender", "ClaimantType", "ClaimStatus", "BodyPartRegion")]
df_arbol2 <- na.omit(df_arbol2)

# ¿Cúantos NA tengo en la base de datos?
sapply(df_arbol2, function(x) sum(is.na(x)))
##         Gender   ClaimantType    ClaimStatus BodyPartRegion 
##              0              0              0              0

Creación del Árbol

#ÁRBOL 1

ad1 <- rpart(formula = Gender ~ ., data = df_arbol1)
ad1
## n= 15412 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 15412 8115 Male (0.41084869 0.47346224 0.11568907)  
##   2) processing_time< 3634.5 14239 7223 Male (0.43781164 0.49273123 0.06945712)  
##     4) processing_time< 1004.5 6209 3124 Male (0.48397488 0.49685940 0.01916573)  
##       8) processing_time< 218.5 1321  546 Female (0.58667676 0.40651022 0.00681302) *
##       9) processing_time>=218.5 4888 2340 Male (0.45621931 0.52127660 0.02250409) *
##     5) processing_time>=1004.5 8030 4099 Male (0.40211706 0.48953923 0.10834371) *
##   3) processing_time>=3634.5 1173  379 Not Available (0.08354646 0.23955669 0.67689685) *
rpart.plot(ad1)

prp(ad1)

#ÁRBOL 2

ad2 <- rpart(formula = Gender ~ ., data = df_arbol2)
ad2
## n= 16857 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 16857 8834 Male (0.417452690 0.475944711 0.106602598)  
##   2) BodyPartRegion=Head,Lower Extremities,Multiple Body Parts,Neck,Trunk,Upper Extremities 14540 7040 Male (0.479229711 0.515818432 0.004951857)  
##     4) BodyPartRegion=Multiple Body Parts 1889  852 Female (0.548967708 0.449973531 0.001058761) *
##     5) BodyPartRegion=Head,Lower Extremities,Neck,Trunk,Upper Extremities 12651 6001 Male (0.468816694 0.525650146 0.005533159) *
##   3) BodyPartRegion=Non-Standard Code 2317  592 Not Available (0.029779888 0.225722918 0.744497195) *
rpart.plot(ad2)

prp(ad2)

ANÁLISIS DE CLUSTERS

Analizar Base de Datos

df_clus <- df_regr

sapply(df_clus, function(x) sum(is.na(x)))
##                  ClaimID                TotalPaid            TotalReserves 
##                        0                        0                        0 
##            TotalRecovery            IndemnityPaid                OtherPaid 
##                        0                        0                        0 
##              ClaimStatus             IncidentDate         ReturnToWorkDate 
##                        0                        0                     5605 
##        AverageWeeklyWage       ClaimantOpenedDate       ClaimantClosedDate 
##                        0                        0                     1445 
## EmployerNotificationDate             ReceivedDate                 IsDenied 
##                     3971                        0                        0 
##       ClaimantAge_at_DOI                   Gender             ClaimantType 
##                     5366                        0                        0 
##             InjuryNature           BodyPartRegion                 BodyPart 
##                        0                        0                        0 
##            BillReviewALE                 Hospital      PhysicianOutpatient 
##                    14035                    14200                    10226 
##                       Rx          processing_time 
##                    14338                     1445
summary(df_clus)
##     ClaimID           TotalPaid         TotalReserves    TotalRecovery    
##  Min.   :  650915   Min.   :    26.88   Min.   :     0   Min.   :    0.0  
##  1st Qu.:  814288   1st Qu.:  3330.54   1st Qu.:     0   1st Qu.:    0.0  
##  Median :  850173   Median : 11847.95   Median :     0   Median :    0.0  
##  Mean   :10434983   Mean   : 23907.09   Mean   :  4933   Mean   :  307.4  
##  3rd Qu.:22717577   3rd Qu.: 31129.08   3rd Qu.:     0   3rd Qu.:    0.0  
##  Max.   :62032209   Max.   :149949.17   Max.   :659235   Max.   :85000.0  
##                                                                           
##  IndemnityPaid        OtherPaid      ClaimStatus       
##  Min.   :    15.5   Min.   : -7820   Length:16857      
##  1st Qu.:  1079.4   1st Qu.:  1413   Class :character  
##  Median :  4664.3   Median :  5614   Mode  :character  
##  Mean   : 11742.7   Mean   : 12164                     
##  3rd Qu.: 13089.5   3rd Qu.: 16237                     
##  Max.   :147420.0   Max.   :137142                     
##                                                        
##   IncidentDate                    ReturnToWorkDate                
##  Min.   :1971-11-24 00:00:00.00   Min.   :1984-01-01 00:00:00.00  
##  1st Qu.:1998-07-18 00:00:00.00   1st Qu.:2000-10-19 18:00:00.00  
##  Median :2003-10-09 00:00:00.00   Median :2005-06-28 00:00:00.00  
##  Mean   :2003-05-21 23:48:48.55   Mean   :2005-03-26 13:16:08.78  
##  3rd Qu.:2008-10-20 00:00:00.00   3rd Qu.:2009-11-18 00:00:00.00  
##  Max.   :2014-06-18 00:00:00.00   Max.   :2014-06-19 00:00:00.00  
##                                   NA's   :5605                    
##  AverageWeeklyWage ClaimantOpenedDate              
##  Min.   :    1.0   Min.   :1971-11-24 00:00:00.00  
##  1st Qu.:  467.0   1st Qu.:1998-09-22 00:00:00.00  
##  Median :  592.9   Median :2003-11-18 00:00:00.00  
##  Mean   :  592.9   Mean   :2003-07-29 11:06:44.26  
##  3rd Qu.:  638.4   3rd Qu.:2009-03-11 00:00:00.00  
##  Max.   :15062.3   Max.   :2014-06-24 00:00:00.00  
##                                                    
##  ClaimantClosedDate               EmployerNotificationDate        
##  Min.   :1999-07-12 00:00:00.00   Min.   :1978-04-07 00:00:00.00  
##  1st Qu.:2005-03-31 00:00:00.00   1st Qu.:2000-06-21 00:00:00.00  
##  Median :2006-03-31 00:00:00.00   Median :2005-04-09 00:00:00.00  
##  Mean   :2007-03-24 08:30:53.72   Mean   :2005-08-15 23:35:58.44  
##  3rd Qu.:2010-03-29 00:00:00.00   3rd Qu.:2009-11-04 18:00:00.00  
##  Max.   :2014-06-30 00:00:00.00   Max.   :9999-07-01 00:00:00.00  
##  NA's   :1445                     NA's   :3971                    
##   ReceivedDate                      IsDenied         ClaimantAge_at_DOI
##  Min.   :1971-11-24 00:00:00.00   Length:16857       Min.   :-5973.00  
##  1st Qu.:1998-09-22 00:00:00.00   Class :character   1st Qu.:   36.00  
##  Median :2003-11-19 00:00:00.00   Mode  :character   Median :   45.00  
##  Mean   :2003-10-25 04:58:07.87                      Mean   :   43.29  
##  3rd Qu.:2008-11-12 00:00:00.00                      3rd Qu.:   53.00  
##  Max.   :4996-02-17 00:00:00.00                      Max.   :   87.00  
##                                                      NA's   :5366      
##     Gender          ClaimantType       InjuryNature       BodyPartRegion    
##  Length:16857       Length:16857       Length:16857       Length:16857      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    BodyPart         BillReviewALE         Hospital      PhysicianOutpatient
##  Length:16857       Min.   :    0.00   Min.   :     0   Min.   :   -18.8   
##  Class :character   1st Qu.:    8.25   1st Qu.:   187   1st Qu.:   105.0   
##  Mode  :character   Median :   24.00   Median :   536   Median :   216.9   
##                     Mean   :  179.20   Mean   :  4292   Mean   :  1662.5   
##                     3rd Qu.:   64.00   3rd Qu.:  2075   3rd Qu.:   656.3   
##                     Max.   :15836.77   Max.   :408665   Max.   :318039.6   
##                     NA's   :14035      NA's   :14200    NA's   :10226      
##        Rx            processing_time
##  Min.   :     0.00   Min.   :    7  
##  1st Qu.:    21.70   1st Qu.:  587  
##  Median :    56.38   Median : 1290  
##  Mean   :  1326.86   Mean   : 1598  
##  3rd Qu.:   177.70   3rd Qu.: 2382  
##  Max.   :187627.17   Max.   :10830  
##  NA's   :14338       NA's   :1445

Determinar y Limpiar Nueva Base de Datos

df_clus <- df_clus[, c("TotalPaid", "TotalRecovery", "IndemnityPaid", "OtherPaid", "AverageWeeklyWage", "ClaimantAge_at_DOI", "processing_time")]

summary(df_clus)
##    TotalPaid         TotalRecovery     IndemnityPaid        OtherPaid     
##  Min.   :    26.88   Min.   :    0.0   Min.   :    15.5   Min.   : -7820  
##  1st Qu.:  3330.54   1st Qu.:    0.0   1st Qu.:  1079.4   1st Qu.:  1413  
##  Median : 11847.95   Median :    0.0   Median :  4664.3   Median :  5614  
##  Mean   : 23907.09   Mean   :  307.4   Mean   : 11742.7   Mean   : 12164  
##  3rd Qu.: 31129.08   3rd Qu.:    0.0   3rd Qu.: 13089.5   3rd Qu.: 16237  
##  Max.   :149949.17   Max.   :85000.0   Max.   :147420.0   Max.   :137142  
##                                                                           
##  AverageWeeklyWage ClaimantAge_at_DOI processing_time
##  Min.   :    1.0   Min.   :-5973.00   Min.   :    7  
##  1st Qu.:  467.0   1st Qu.:   36.00   1st Qu.:  587  
##  Median :  592.9   Median :   45.00   Median : 1290  
##  Mean   :  592.9   Mean   :   43.29   Mean   : 1598  
##  3rd Qu.:  638.4   3rd Qu.:   53.00   3rd Qu.: 2382  
##  Max.   :15062.3   Max.   :   87.00   Max.   :10830  
##                    NA's   :5366       NA's   :1445
df_clus <- na.omit(df_clus)
sapply(df_clus, function(x) sum(is.na(x)))
##          TotalPaid      TotalRecovery      IndemnityPaid          OtherPaid 
##                  0                  0                  0                  0 
##  AverageWeeklyWage ClaimantAge_at_DOI    processing_time 
##                  0                  0                  0
#Borramos dos filas debido al error que apareció en el siguiente chunk
#Error in data.frame(..., check.names = FALSE) : 
#arguments imply differing number of rows: 3768, 10046
#df_clus <- df_clus[-3768, ]
#df_clus <- df_clus[-10044, ]

Crear Clusters

#O. normalizar variables
#df_clus <- as.data.frame(scale(df_clus))

# 1. Crear base de datos
df_clus
## # A tibble: 10,046 × 7
##    TotalPaid TotalRecovery IndemnityPaid OtherPaid AverageWeeklyWage
##        <dbl>         <dbl>         <dbl>     <dbl>             <dbl>
##  1    11948.             0          244.    11704.              640.
##  2    43108.             0        40000      3108.              539 
##  3     1990.             0          605.     1385.              756.
##  4     5407.             0         2246.     3161.              328.
##  5    32606.             0        20000     12606.              512.
##  6      633.             0          365.      268.              767.
##  7   114251.             0        44271.    69980.              431.
##  8    21020.             0         9344.    11676.              798.
##  9    78853.             0        35239.    43614.              445.
## 10     3001.             0         2020.      981.              421.
## # ℹ 10,036 more rows
## # ℹ 2 more variables: ClaimantAge_at_DOI <dbl>, processing_time <dbl>
# 2. Determinar el número de grupos
grupos <- 8

# 3. Realizar la clasificación
segmentos <- kmeans(df_clus,grupos)


# 4. Revisar la asignación de grupos
asignacion <- cbind(df_clus, cluster=segmentos$cluster)


# 5. Graficar asignaciones
# install.packages("ggplot2")
library(ggplot2)
# install.packages("factoextra")
library(factoextra)

fviz_cluster(segmentos,data=df_clus)

Optimización de Clusters

# 6. Optimizar la cantidad de grupos

#Se trababa al hacer la optimización pero el código era el siguiente:

#set.seed(123)
#optimizacion <- clusGap(df_clus, FUN = kmeans, nstart = 1, K.max = 5)
#plot(optimizacion, xlab = "Número de clusters K")

# El punto más alto de la gráfica indica la cantidad de grupos óptimo