Contexto

1. Importar y juntar bases de datos

# file.choose()  # interactive helper for locating the files
# Load the claims table and the transactions summary, then join them on
# ClaimID. all = TRUE makes this a full outer join: claims present in only
# one of the two files are kept, with NA in the columns they lack.
bd1 <- read.csv(file = "C:\\Users\\ximen\\Downloads\\ClaimsData2018.csv")
bd2 <- read.csv(file = "C:\\Users\\ximen\\Downloads\\TransactionsSummary.csv")
bd <- merge(x = bd1, y = bd2, by = "ClaimID", all = TRUE)

2. Crear nueva columna para Total Incurred Cost

# install.packages("dplyr")
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Total incurred cost per claim: reserves plus both paid amounts, minus
# recoveries. An NA in any of the four inputs yields NA for that claim.
bd <- mutate(
  bd,
  Total_Incurred_Cost_Claim =
    TotalReserves + IndemnityPaid + OtherPaid - TotalRecovery
)

3. Filtrar base de datos por sólo mujeres

# Keep only the claims whose Gender is exactly "Female"; rows where Gender
# is NA are dropped as well.
bd_mujeres1 <- bd[which(bd$Gender == "Female"), ]
# View(bd_mujeres1)

4. Eliminar columnas de X

library(dplyr)
# Drop the contiguous run of columns from X through X.22.
bd_mujeres1 <- select(bd_mujeres1, -(X:X.22))
# View(bd_mujeres1)

5. Descargar base de datos limpia como CSV

# Export the cleaned, women-only table (the path is relative to the current
# working directory), then print per-column summary statistics.
write.csv(x = bd_mujeres1, file = "bd_mujeres limpia.csv", row.names = FALSE)
summary(object = bd_mujeres1)
##     ClaimID           TotalPaid         TotalReserves     TotalRecovery     
##  Min.   :  650919   Min.   :    -81.8   Min.   :      0   Min.   :    0.00  
##  1st Qu.:  806228   1st Qu.:     20.1   1st Qu.:      0   1st Qu.:    0.00  
##  Median :  833851   Median :    223.0   Median :      0   Median :    0.00  
##  Mean   : 8053898   Mean   :   6504.3   Mean   :   2423   Mean   :   31.13  
##  3rd Qu.: 7143280   3rd Qu.:    932.1   3rd Qu.:      0   3rd Qu.:    0.00  
##  Max.   :62203889   Max.   :2985247.9   Max.   :2069575   Max.   :90357.52  
##                                                                             
##  IndemnityPaid        OtherPaid         ClaimStatus        IncidentDate      
##  Min.   :    -1.2   Min.   :    -81.8   Length:59197       Length:59197      
##  1st Qu.:     0.0   1st Qu.:     16.4   Class :character   Class :character  
##  Median :     0.0   Median :    218.7   Mode  :character   Mode  :character  
##  Mean   :  2945.2   Mean   :   3559.1                                        
##  3rd Qu.:     0.0   3rd Qu.:    857.8                                        
##  Max.   :492934.8   Max.   :2700073.4                                        
##                                                                              
##  IncidentDescription ReturnToWorkDate   AverageWeeklyWage  ClaimantOpenedDate
##  Length:59197        Length:59197       Length:59197       Length:59197      
##  Class :character    Class :character   Class :character   Class :character  
##  Mode  :character    Mode  :character   Mode  :character   Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##  ClaimantClosedDate EmployerNotificationDate ReceivedDate      
##  Length:59197       Length:59197             Length:59197      
##  Class :character   Class :character         Class :character  
##  Mode  :character   Mode  :character         Mode  :character  
##                                                                
##                                                                
##                                                                
##                                                                
##     IsDenied       ClaimantAge_at_DOI    Gender          ClaimantType      
##  Min.   :0.00000   Length:59197       Length:59197       Length:59197      
##  1st Qu.:0.00000   Class :character   Class :character   Class :character  
##  Median :0.00000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.05688                                                           
##  3rd Qu.:0.00000                                                           
##  Max.   :1.00000                                                           
##                                                                            
##  InjuryNature       BodyPartRegion       BodyPart         BillReviewALE     
##  Length:59197       Length:59197       Length:59197       Min.   : -456.00  
##  Class :character   Class :character   Class :character   1st Qu.:    8.25  
##  Mode  :character   Mode  :character   Mode  :character   Median :   24.00  
##                                                           Mean   :  174.80  
##                                                           3rd Qu.:   64.00  
##                                                           Max.   :20730.77  
##                                                           NA's   :46628     
##     Hospital        PhysicianOutpatient       Rx          
##  Min.   :-12570.4   Min.   :   -162.9   Min.   :  -160.7  
##  1st Qu.:   203.1   1st Qu.:    106.8   1st Qu.:    23.4  
##  Median :   572.9   Median :    220.2   Median :    61.1  
##  Mean   :  4580.8   Mean   :   1700.4   Mean   :  1357.1  
##  3rd Qu.:  2213.5   3rd Qu.:    667.2   3rd Qu.:   176.5  
##  Max.   :667973.0   Max.   :1481468.5   Max.   :380924.3  
##  NA's   :49187      NA's   :34369       NA's   :49906     
##  Total_Incurred_Cost_Claim
##  Min.   :  -2961          
##  1st Qu.:     22          
##  Median :    226          
##  Mean   :   8897          
##  3rd Qu.:    976          
##  Max.   :5054823          
## 

6. Entender la base de datos

# Per-column summary statistics of the women-only table.
summary(bd_mujeres1)
##     ClaimID           TotalPaid         TotalReserves     TotalRecovery     
##  Min.   :  650919   Min.   :    -81.8   Min.   :      0   Min.   :    0.00  
##  1st Qu.:  806228   1st Qu.:     20.1   1st Qu.:      0   1st Qu.:    0.00  
##  Median :  833851   Median :    223.0   Median :      0   Median :    0.00  
##  Mean   : 8053898   Mean   :   6504.3   Mean   :   2423   Mean   :   31.13  
##  3rd Qu.: 7143280   3rd Qu.:    932.1   3rd Qu.:      0   3rd Qu.:    0.00  
##  Max.   :62203889   Max.   :2985247.9   Max.   :2069575   Max.   :90357.52  
##                                                                             
##  IndemnityPaid        OtherPaid         ClaimStatus        IncidentDate      
##  Min.   :    -1.2   Min.   :    -81.8   Length:59197       Length:59197      
##  1st Qu.:     0.0   1st Qu.:     16.4   Class :character   Class :character  
##  Median :     0.0   Median :    218.7   Mode  :character   Mode  :character  
##  Mean   :  2945.2   Mean   :   3559.1                                        
##  3rd Qu.:     0.0   3rd Qu.:    857.8                                        
##  Max.   :492934.8   Max.   :2700073.4                                        
##                                                                              
##  IncidentDescription ReturnToWorkDate   AverageWeeklyWage  ClaimantOpenedDate
##  Length:59197        Length:59197       Length:59197       Length:59197      
##  Class :character    Class :character   Class :character   Class :character  
##  Mode  :character    Mode  :character   Mode  :character   Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##  ClaimantClosedDate EmployerNotificationDate ReceivedDate      
##  Length:59197       Length:59197             Length:59197      
##  Class :character   Class :character         Class :character  
##  Mode  :character   Mode  :character         Mode  :character  
##                                                                
##                                                                
##                                                                
##                                                                
##     IsDenied       ClaimantAge_at_DOI    Gender          ClaimantType      
##  Min.   :0.00000   Length:59197       Length:59197       Length:59197      
##  1st Qu.:0.00000   Class :character   Class :character   Class :character  
##  Median :0.00000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.05688                                                           
##  3rd Qu.:0.00000                                                           
##  Max.   :1.00000                                                           
##                                                                            
##  InjuryNature       BodyPartRegion       BodyPart         BillReviewALE     
##  Length:59197       Length:59197       Length:59197       Min.   : -456.00  
##  Class :character   Class :character   Class :character   1st Qu.:    8.25  
##  Mode  :character   Mode  :character   Mode  :character   Median :   24.00  
##                                                           Mean   :  174.80  
##                                                           3rd Qu.:   64.00  
##                                                           Max.   :20730.77  
##                                                           NA's   :46628     
##     Hospital        PhysicianOutpatient       Rx          
##  Min.   :-12570.4   Min.   :   -162.9   Min.   :  -160.7  
##  1st Qu.:   203.1   1st Qu.:    106.8   1st Qu.:    23.4  
##  Median :   572.9   Median :    220.2   Median :    61.1  
##  Mean   :  4580.8   Mean   :   1700.4   Mean   :  1357.1  
##  3rd Qu.:  2213.5   3rd Qu.:    667.2   3rd Qu.:   176.5  
##  Max.   :667973.0   Max.   :1481468.5   Max.   :380924.3  
##  NA's   :49187      NA's   :34369       NA's   :49906     
##  Total_Incurred_Cost_Claim
##  Min.   :  -2961          
##  1st Qu.:     22          
##  Median :    226          
##  Mean   :   8897          
##  3rd Qu.:    976          
##  Max.   :5054823          
## 
#count(bd_mujeres1, ClaimStatus, sort= TRUE)
#count(bd_mujeres1, IncidentDate, sort= TRUE)
#count(bd_mujeres1, IncidentDescription, sort= TRUE)
#count(bd_mujeres1, ReturnToWorkDate, sort= TRUE)
#count(bd_mujeres1, AverageWeeklyWage, sort= TRUE)
#count(bd_mujeres1, ClaimantOpenedDate, sort= TRUE)
#count(bd_mujeres1, ClaimantClosedDate, sort= TRUE)
#count(bd_mujeres1, EmployerNotificationDate, sort= TRUE)
#count(bd_mujeres1, ReceivedDate, sort= TRUE)
#count(bd_mujeres1, ClaimantAge_at_DOI, sort= TRUE)
#count(bd_mujeres1, Gender, sort= TRUE)
#count(bd_mujeres1, ClaimantType, sort= TRUE)
#count(bd_mujeres1, InjuryNature, sort= TRUE)
#count(bd_mujeres1, BodyPartRegion, sort= TRUE)
#count(bd_mujeres1, BodyPart, sort= TRUE)

Observaciones:
1. Tenemos muchos “#VALUE!” en la variable ReturnToWorkDate
2. Hay muchos NULL en AverageWeeklyWage
3. Tenemos muchos “#VALUE!” en la variable EmployerNotificationDate
4. Hay muchos NULL en ClaimantAge_at_DOI

# Keep only the four categorical variables of interest.
Mujeres <- bd_mujeres1[
  c("ClaimStatus", "ClaimantType", "BodyPartRegion", "InjuryNature")
]

Regresión Lineal

1. Importar base de datos limpia

# file.choose()
# Load the cleaned export produced earlier.
# NOTE(review): the export step wrote "bd_mujeres limpia.csv" relative to the
# working directory, while this reads it from Downloads — confirm both paths
# point to the same file.
bd_mujereslimpia <- read.csv(
  file = "C:\\Users\\ximen\\Downloads\\bd_mujeres limpia.csv"
)

2. Identificar las variables de interés

library(dplyr)
# Keep only the columns needed for the regression analysis, then summarise.
bd_mujeresrl <- select(
  bd_mujereslimpia,
  ClaimID, ClaimStatus, BodyPartRegion, ClaimantAge_at_DOI, Gender,
  ClaimantType, ClaimantOpenedDate, ClaimantClosedDate,
  Total_Incurred_Cost_Claim
)
# View(bd_mujeresrl)
summary(bd_mujeresrl)
##     ClaimID         ClaimStatus        BodyPartRegion     ClaimantAge_at_DOI
##  Min.   :  650919   Length:59197       Length:59197       Length:59197      
##  1st Qu.:  806228   Class :character   Class :character   Class :character  
##  Median :  833851   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 8053898                                                           
##  3rd Qu.: 7143280                                                           
##  Max.   :62203889                                                           
##     Gender          ClaimantType       ClaimantOpenedDate ClaimantClosedDate
##  Length:59197       Length:59197       Length:59197       Length:59197      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Total_Incurred_Cost_Claim
##  Min.   :  -2961          
##  1st Qu.:     22          
##  Median :    226          
##  Mean   :   8897          
##  3rd Qu.:    976          
##  Max.   :5054823

3. Conversión de fechas a días

# Parse both date columns as month/day/two-digit-year; values that do not
# match that format become NA.
bd_mujeresrl[c("ClaimantOpenedDate", "ClaimantClosedDate")] <- lapply(
  bd_mujeresrl[c("ClaimantOpenedDate", "ClaimantClosedDate")],
  as.Date,
  format = "%m/%d/%y"
)

# Processing time in days: closed date minus opened date (NA when either
# date is missing).
bd_mujeresrl$TiempoDeProcesamientoDias <- as.numeric(
  difftime(
    time1 = bd_mujeresrl$ClaimantClosedDate,
    time2 = bd_mujeresrl$ClaimantOpenedDate,
    units = "days"
  )
)

# Drop the two original date columns now that the duration is stored.
bd_mujeresrl <- bd_mujeresrl[
  , setdiff(names(bd_mujeresrl), c("ClaimantOpenedDate", "ClaimantClosedDate"))
]
# View(bd_mujeresrl)
summary(bd_mujeresrl)
##     ClaimID         ClaimStatus        BodyPartRegion     ClaimantAge_at_DOI
##  Min.   :  650919   Length:59197       Length:59197       Length:59197      
##  1st Qu.:  806228   Class :character   Class :character   Class :character  
##  Median :  833851   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 8053898                                                           
##  3rd Qu.: 7143280                                                           
##  Max.   :62203889                                                           
##                                                                             
##     Gender          ClaimantType       Total_Incurred_Cost_Claim
##  Length:59197       Length:59197       Min.   :  -2961          
##  Class :character   Class :character   1st Qu.:     22          
##  Mode  :character   Mode  :character   Median :    226          
##                                        Mean   :   8897          
##                                        3rd Qu.:    976          
##                                        Max.   :5054823          
##                                                                 
##  TiempoDeProcesamientoDias
##  Min.   :-333.0           
##  1st Qu.:   0.0           
##  Median : 245.0           
##  Mean   : 806.1           
##  3rd Qu.: 920.0           
##  Max.   :6912.0           
##  NA's   :54104

4. Conversión de carácter a números

# Recode claim status C/O/R as a factor labelled 1/2/3 (it stays a factor).
bd_mujeresrl$ClaimStatus <- factor(
  bd_mujeresrl$ClaimStatus,
  levels = c("C", "O", "R"),
  labels = 1:3
)
# Age arrives as text; entries that are not numeric become NA, which is what
# triggers the coercion warning recorded below.
bd_mujeresrl$ClaimantAge_at_DOI <- as.numeric(bd_mujeresrl$ClaimantAge_at_DOI)
## Warning: NAs introducidos por coerción
# Numeric code = position of the value in the listed categories; values not
# in the list become NA.
bd_mujeresrl$Gender <- as.numeric(
  match(bd_mujeresrl$Gender, c("Male", "Female", "Not Provided"))
)
bd_mujeresrl$ClaimantType <- as.numeric(
  match(
    bd_mujeresrl$ClaimantType,
    c("Medical Only", "Indemnity", "Report Only")
  )
)

5. Eliminar valores NA

# Drop every row that has an NA in any column. The result is written back to
# bd_mujeresrl: assigning it to `bd` would overwrite the merged table and
# leave bd_mujeresrl (and the summary printed next) with its NAs intact.
# NOTE: any summary output recorded right after this call in the rendered
# document was produced before this fix; re-run to refresh it.
bd_mujeresrl <- na.omit(bd_mujeresrl)
summary(bd_mujeresrl)
##     ClaimID         ClaimStatus BodyPartRegion     ClaimantAge_at_DOI
##  Min.   :  650919   1:56900     Length:59197       Min.   :-8000.00  
##  1st Qu.:  806228   2: 1786     Class :character   1st Qu.:   33.00  
##  Median :  833851   3:  511     Mode  :character   Median :   43.00  
##  Mean   : 8053898                                  Mean   :   39.75  
##  3rd Qu.: 7143280                                  3rd Qu.:   52.00  
##  Max.   :62203889                                  Max.   :   89.00  
##                                                    NA's   :17097     
##      Gender   ClaimantType   Total_Incurred_Cost_Claim
##  Min.   :2   Min.   :1.000   Min.   :  -2961          
##  1st Qu.:2   1st Qu.:1.000   1st Qu.:     22          
##  Median :2   Median :1.000   Median :    226          
##  Mean   :2   Mean   :1.357   Mean   :   8897          
##  3rd Qu.:2   3rd Qu.:2.000   3rd Qu.:    976          
##  Max.   :2   Max.   :3.000   Max.   :5054823          
##                                                       
##  TiempoDeProcesamientoDias
##  Min.   :-333.0           
##  1st Qu.:   0.0           
##  Median : 245.0           
##  Mean   : 806.1           
##  3rd Qu.: 920.0           
##  Max.   :6912.0           
##  NA's   :54104

5.Eliminar valores negativos

# Keep only rows where age, total incurred cost and processing time are all
# non-negative. filter() also drops rows where any of the three is NA.
bd_mujeresrl <- filter(
  bd_mujeresrl,
  ClaimantAge_at_DOI >= 0,
  Total_Incurred_Cost_Claim >= 0,
  TiempoDeProcesamientoDias >= 0
)

summary(bd_mujeresrl)
##     ClaimID         ClaimStatus BodyPartRegion     ClaimantAge_at_DOI
##  Min.   :  650919   1:4015      Length:4018        Min.   : 1.00     
##  1st Qu.:  823404   2:   3      Class :character   1st Qu.:34.00     
##  Median : 5970814   3:   0      Mode  :character   Median :44.00     
##  Mean   :15622363                                  Mean   :43.14     
##  3rd Qu.:30288888                                  3rd Qu.:52.00     
##  Max.   :61592860                                  Max.   :87.00     
##      Gender   ClaimantType  Total_Incurred_Cost_Claim TiempoDeProcesamientoDias
##  Min.   :2   Min.   :1.00   Min.   :     0.0          Min.   :   0.0           
##  1st Qu.:2   1st Qu.:1.00   1st Qu.:     0.0          1st Qu.:  33.0           
##  Median :2   Median :1.00   Median :   171.3          Median : 301.0           
##  Mean   :2   Mean   :1.68   Mean   :  4459.3          Mean   : 717.1           
##  3rd Qu.:2   3rd Qu.:2.00   3rd Qu.:  1125.3          3rd Qu.: 844.0           
##  Max.   :2   Max.   :3.00   Max.   :388620.8          Max.   :6912.0

6.Regresión Lineal

# Linear model of total incurred cost on claim ID, age at injury, body-part
# region and processing time.
# NOTE(review): ClaimID looks like an identifier rather than a meaningful
# predictor — confirm it belongs in the model.
regresion <- lm(
  formula = Total_Incurred_Cost_Claim ~
    ClaimID + ClaimantAge_at_DOI + BodyPartRegion + TiempoDeProcesamientoDias,
  data = bd_mujeresrl
)
summary(regresion)
## 
## Call:
## lm(formula = Total_Incurred_Cost_Claim ~ ClaimID + ClaimantAge_at_DOI + 
##     BodyPartRegion + TiempoDeProcesamientoDias, data = bd_mujeresrl)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -23086  -4840  -2286    -14 363016 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -4.842e+03  1.676e+03  -2.889  0.00388 ** 
## ClaimID                           -5.826e-05  2.034e-05  -2.864  0.00421 ** 
## ClaimantAge_at_DOI                 1.269e+02  2.770e+01   4.580 4.78e-06 ***
## BodyPartRegionLower Extremities    3.031e+03  1.231e+03   2.461  0.01389 *  
## BodyPartRegionMultiple Body Parts  1.952e+03  1.403e+03   1.391  0.16417    
## BodyPartRegionNeck                 3.727e+03  1.893e+03   1.969  0.04903 *  
## BodyPartRegionNon-Standard Code   -8.554e+02  2.703e+03  -0.316  0.75166    
## BodyPartRegionNot Available       -3.580e+00  2.056e+04   0.000  0.99986    
## BodyPartRegionTrunk                3.408e+03  1.336e+03   2.551  0.01076 *  
## BodyPartRegionUpper Extremities    1.794e+03  1.170e+03   1.533  0.12547    
## TiempoDeProcesamientoDias          3.543e+00  3.343e-01  10.599  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20530 on 4007 degrees of freedom
## Multiple R-squared:  0.04572,    Adjusted R-squared:  0.04334 
## F-statistic:  19.2 on 10 and 4007 DF,  p-value: < 2.2e-16

7. Ajuste de variables para intentar aumentar R²

# Refit the cost model without ClaimID: age at injury, body-part region and
# processing time only. Overwrites the previous `regresion` object.
regresion <- lm(
  formula = Total_Incurred_Cost_Claim ~
    ClaimantAge_at_DOI + BodyPartRegion + TiempoDeProcesamientoDias,
  data = bd_mujeresrl
)
summary(regresion)
## 
## Call:
## lm(formula = Total_Incurred_Cost_Claim ~ ClaimantAge_at_DOI + 
##     BodyPartRegion + TiempoDeProcesamientoDias, data = bd_mujeresrl)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25854  -4494  -2357   -250 362038 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -6264.9460  1602.2526  -3.910 9.38e-05 ***
## ClaimantAge_at_DOI                  126.0115    27.7233   4.545 5.65e-06 ***
## BodyPartRegionLower Extremities    3348.0154  1227.5452   2.727  0.00641 ** 
## BodyPartRegionMultiple Body Parts  2561.3990  1387.9538   1.845  0.06505 .  
## BodyPartRegionNeck                 3727.3213  1894.4594   1.967  0.04920 *  
## BodyPartRegionNon-Standard Code   -1174.6409  2702.9742  -0.435  0.66390    
## BodyPartRegionNot Available        -665.6847 20574.4218  -0.032  0.97419    
## BodyPartRegionTrunk                3846.9340  1328.2441   2.896  0.00380 ** 
## BodyPartRegionUpper Extremities    2010.1022  1168.9276   1.720  0.08558 .  
## TiempoDeProcesamientoDias             3.9175     0.3079  12.724  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20550 on 4008 degrees of freedom
## Multiple R-squared:  0.04377,    Adjusted R-squared:  0.04162 
## F-statistic: 20.38 on 9 and 4008 DF,  p-value: < 2.2e-16

8.Modelo Predictivo

# Predict the cost of one hypothetical lower-extremity claim. Age and
# processing time are set to the sample means shown in the filtered summary
# (43.14 years, 717.1 days).
datos <- data.frame(
  ClaimantAge_at_DOI = 43.14,
  TiempoDeProcesamientoDias = 717.1,
  BodyPartRegion = "Lower Extremities"
)
predict(object = regresion, newdata = datos)
##        1 
## 5328.477

9.Regresión Lineal Ajustada

# Model processing time (days) from age at injury, claimant type and total
# incurred cost. Overwrites the previous `regresion` object.
# NOTE(review): ClaimantType is a nominal category stored as the numeric code
# 1/2/3, so it enters here as a single linear term — confirm this is intended
# rather than factor(ClaimantType).
regresion <- lm(
  formula = TiempoDeProcesamientoDias ~
    ClaimantAge_at_DOI + ClaimantType + Total_Incurred_Cost_Claim,
  data = bd_mujeresrl
)
summary(regresion)
## 
## Call:
## lm(formula = TiempoDeProcesamientoDias ~ ClaimantAge_at_DOI + 
##     ClaimantType + Total_Incurred_Cost_Claim, data = bd_mujeresrl)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2719.3  -569.3  -289.7    48.3  6332.3 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                1.716e+03  6.655e+01  25.789   <2e-16 ***
## ClaimantAge_at_DOI        -1.229e+01  1.351e+00  -9.099   <2e-16 ***
## ClaimantType              -3.075e+02  1.961e+01 -15.682   <2e-16 ***
## Total_Incurred_Cost_Claim  1.071e-02  7.620e-04  14.061   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1011 on 4014 degrees of freedom
## Multiple R-squared:  0.1133, Adjusted R-squared:  0.1126 
## F-statistic:   171 on 3 and 4014 DF,  p-value: < 2.2e-16

10.Conclusión

Clústers

1.Identificar Outliers

# Horizontal box plots to inspect outliers in cost and in processing time.
boxplot(x = bd_mujeresrl[["Total_Incurred_Cost_Claim"]], horizontal = TRUE)

boxplot(x = bd_mujeresrl[["TiempoDeProcesamientoDias"]], horizontal = TRUE)

2. Eliminar Outliers

# Build the clustering table: the two numeric measures only, in the order
# processing time, then cost.
bd_mujerescl <- bd_mujeresrl[
  c("TiempoDeProcesamientoDias", "Total_Incurred_Cost_Claim")
]

# Label each row with its ClaimID instead of carrying it as a column.
rownames(bd_mujerescl) <- bd_mujeresrl$ClaimID
# View(bd_mujeresCL)

# Processing-time column: remove outliers with the 1.5 * IQR rule.
# Q1 and Q3 are computed from the data (quantile()'s default type is the one
# summary() and IQR() use) instead of being copied by hand from the summary
# printout, so the fences stay correct if the data changes.
Q1_TiempoDeProcesamientoDias <- quantile(
  bd_mujerescl$TiempoDeProcesamientoDias, probs = 0.25, names = FALSE
)
Q3_TiempoDeProcesamientoDias <- quantile(
  bd_mujerescl$TiempoDeProcesamientoDias, probs = 0.75, names = FALSE
)
IQR_TiempoDeProcesamientoDias <- IQR(bd_mujerescl$TiempoDeProcesamientoDias)
IQR_TiempoDeProcesamientoDias
## [1] 811
summary(bd_mujerescl)
##  TiempoDeProcesamientoDias Total_Incurred_Cost_Claim
##  Min.   :   0.0            Min.   :     0.0         
##  1st Qu.:  33.0            1st Qu.:     0.0         
##  Median : 301.0            Median :   171.3         
##  Mean   : 717.1            Mean   :  4459.3         
##  3rd Qu.: 844.0            3rd Qu.:  1125.3         
##  Max.   :6912.0            Max.   :388620.8
# Lower and upper fences.
LI_TiempoDeProcesamientoDias <-
  Q1_TiempoDeProcesamientoDias - 1.5 * IQR_TiempoDeProcesamientoDias
LI_TiempoDeProcesamientoDias
## [1] -1183.5
LS_TiempoDeProcesamientoDias <-
  Q3_TiempoDeProcesamientoDias + 1.5 * IQR_TiempoDeProcesamientoDias
LS_TiempoDeProcesamientoDias
## [1] 2060.5
cat("LI_TiempoDeProcesamientoDias:", LI_TiempoDeProcesamientoDias, "\n")
## LI_TiempoDeProcesamientoDias: -1183.5
cat("LS_TiempoDeProcesamientoDias:", LS_TiempoDeProcesamientoDias, "\n")
## LS_TiempoDeProcesamientoDias: 2060.5
# Keep rows inside the fences. The computed bounds are used directly rather
# than a hand-rounded 2061, which sat above the real upper fence.
bd_mujerescl <- bd_mujerescl[
  bd_mujerescl$TiempoDeProcesamientoDias >= LI_TiempoDeProcesamientoDias &
    bd_mujerescl$TiempoDeProcesamientoDias <= LS_TiempoDeProcesamientoDias,
]

# Cost column: remove outliers with the 1.5 * IQR rule.
# Q1 and Q3 are computed on the table as it stands now (after the
# processing-time filter). The hand-typed 1125.3 was the third quartile from
# before that filter; the current one is 1155.7, as the summary below shows.
Q1_Total_Incurred_Cost_Claim <- quantile(
  bd_mujerescl$Total_Incurred_Cost_Claim, probs = 0.25, names = FALSE
)
Q3_Total_Incurred_Cost_Claim <- quantile(
  bd_mujerescl$Total_Incurred_Cost_Claim, probs = 0.75, names = FALSE
)
IQR_Total_Incurred_Cost_Claim <- IQR(bd_mujerescl$Total_Incurred_Cost_Claim)
IQR_Total_Incurred_Cost_Claim
## [1] 1155.71
summary(bd_mujerescl)
##  TiempoDeProcesamientoDias Total_Incurred_Cost_Claim
##  Min.   :   0.0            Min.   :     0.0         
##  1st Qu.:   4.0            1st Qu.:     0.0         
##  Median : 235.0            Median :   177.8         
##  Mean   : 398.6            Mean   :  3707.7         
##  3rd Qu.: 548.0            3rd Qu.:  1155.7         
##  Max.   :2058.0            Max.   :246847.9
# Lower and upper fences.
LI_Total_Incurred_Cost_Claim <-
  Q1_Total_Incurred_Cost_Claim - 1.5 * IQR_Total_Incurred_Cost_Claim
LI_Total_Incurred_Cost_Claim
## [1] -1733.565
LS_Total_Incurred_Cost_Claim <-
  Q3_Total_Incurred_Cost_Claim + 1.5 * IQR_Total_Incurred_Cost_Claim
LS_Total_Incurred_Cost_Claim
# (Recorded output not reproduced: the earlier value, 2858.865, was derived
# from the stale Q3 of 1125.3. With Q3 = 1155.7 the upper fence is about
# 2889.3; re-run to refresh.)
cat("LI_Total_Incurred_Cost_Claim:", LI_Total_Incurred_Cost_Claim, "\n")
## LI_Total_Incurred_Cost_Claim: -1733.565
cat("LS_Total_Incurred_Cost_Claim:", LS_Total_Incurred_Cost_Claim, "\n")
# Keep rows inside the fences, using the computed bounds directly.
bd_mujerescl <- bd_mujerescl[
  bd_mujerescl$Total_Incurred_Cost_Claim >= LI_Total_Incurred_Cost_Claim &
    bd_mujerescl$Total_Incurred_Cost_Claim <= LS_Total_Incurred_Cost_Claim,
]
# NOTE: any summary output recorded after this call in the rendered document
# was produced with the old upper fence; re-run to refresh it.
summary(bd_mujerescl)
##  TiempoDeProcesamientoDias Total_Incurred_Cost_Claim
##  Min.   :   0.0            Min.   :   0.00          
##  1st Qu.:   0.0            1st Qu.:   0.00          
##  Median : 175.0            Median :  98.62          
##  Mean   : 337.7            Mean   : 372.37          
##  3rd Qu.: 460.5            3rd Qu.: 428.27          
##  Max.   :2058.0            Max.   :2851.58

3. Crear grupos

# 0. Standardise both variables with scale() so they are on a common scale
#    before distances are computed.
bd_mujerescl <- as.data.frame(scale(bd_mujerescl))

# 1. Working copy used for clustering
bdmujeresCLUSTER <- bd_mujerescl

# 2. Number of groups
grupos <- 10

# 3. Run k-means. kmeans() picks its starting centres at random, so the seed
#    is fixed first to make the cluster assignment reproducible.
set.seed(123)
segmentos <- kmeans(bdmujeresCLUSTER, centers = grupos)

# 4. Attach each row's cluster id to the data for inspection
asignacion <- cbind(bdmujeresCLUSTER, cluster = segmentos$cluster)

# 5. Plot the assignments
# install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
# install.packages("factoextra")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
fviz_cluster(segmentos, data = bdmujeresCLUSTER,
             ellipse.type = "euclid",
             star.plot = TRUE,
             repel = TRUE,
             ggtheme = theme())

4.Optimizar la cantidad de grupos creados

library(cluster)
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.1
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
# Gap statistic for k = 1..10 to help choose the number of clusters.
# nstart is forwarded to kmeans(); the seed makes the reference simulations
# repeatable.
set.seed(123)
optimizacion <- clusGap(
  x = bdmujeresCLUSTER,
  FUNcluster = kmeans,
  K.max = 10,
  nstart = 1
)
## Warning: did not converge in 10 iterations
plot(optimizacion, xlab = "Número de clusters K")

# Árbol

# Extract the variables used by the regression tree.
mujeresarbol <- bd_mujeresrl[
  , c(
    "Total_Incurred_Cost_Claim", "ClaimStatus", "ClaimantAge_at_DOI",
    "Gender", "ClaimantType", "TiempoDeProcesamientoDias"
  )
]

sum(is.na(mujeresarbol))
## [1] 0
# Set column types: continuous measures as numeric, coded categories as
# factors.
mujeresarbol[
  c("Total_Incurred_Cost_Claim", "ClaimantAge_at_DOI", "TiempoDeProcesamientoDias")
] <- lapply(
  mujeresarbol[
    c("Total_Incurred_Cost_Claim", "ClaimantAge_at_DOI", "TiempoDeProcesamientoDias")
  ],
  as.numeric
)
mujeresarbol[c("ClaimStatus", "Gender", "ClaimantType")] <- lapply(
  mujeresarbol[c("ClaimStatus", "Gender", "ClaimantType")],
  as.factor
)
str(mujeresarbol)
## 'data.frame':    4018 obs. of  6 variables:
##  $ Total_Incurred_Cost_Claim: num  43108 390.2 0 106.4 19.6 ...
##  $ ClaimStatus              : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ClaimantAge_at_DOI       : num  41 38 51 35 42 51 47 36 47 47 ...
##  $ Gender                   : Factor w/ 1 level "2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ClaimantType             : Factor w/ 3 levels "1","2","3": 2 1 1 1 1 1 1 2 1 1 ...
##  $ TiempoDeProcesamientoDias: num  848 2464 2856 2679 3256 ...
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.1
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.1
# Regression tree for processing time, using every other column of
# mujeresarbol as a candidate predictor; then print the fitted splits.
arbol <- rpart(TiempoDeProcesamientoDias ~ ., data = mujeresarbol)
print(arbol)
## n= 4018 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 4018 4623308000  717.11470  
##    2) ClaimantType=3 904  330852900  129.85840  
##      4) Total_Incurred_Cost_Claim< 25 890  219013300   89.88764 *
##      5) Total_Incurred_Cost_Claim>=25 14   20024210 2670.85700 *
##    3) ClaimantType=1,2 3114 3890188000  887.59630  
##      6) Total_Incurred_Cost_Claim>=51.975 2443 2068233000  787.16950  
##       12) Total_Incurred_Cost_Claim< 208180.3 2430 1927711000  772.19140  
##         24) Total_Incurred_Cost_Claim< 12823.46 2155 1650736000  718.96470 *
##         25) Total_Incurred_Cost_Claim>=12823.46 275  223025800 1189.29500 *
##       13) Total_Incurred_Cost_Claim>=208180.3 13   38075070 3586.92300 *
##      7) Total_Incurred_Cost_Claim< 51.975 671 1707609000 1253.23400  
##       14) ClaimantAge_at_DOI>=48.5 196  247688700  726.56120 *
##       15) ClaimantAge_at_DOI< 48.5 475 1383120000 1470.55600 *
# Draw the fitted tree with rpart.plot() using its default settings.
rpart.plot(arbol)

# Draw the same tree again with prp() using its default settings.
prp(arbol)

