# file.choose()
bd1 <- read.csv("/Users/dannaleal/Downloads/ClaimsData2018.csv")
bd2 <- read.csv("/Users/dannaleal/Downloads/TransactionsSummary2018.csv")
bd <- merge(bd1, bd2, by="ClaimID",all=TRUE)# install.packages("dplyr")
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
bd <- bd %>%
mutate(Total_Incurred_Cost_Claim = TotalReserves + IndemnityPaid + OtherPaid - TotalRecovery)bd_mujeres1 <- subset(bd, Gender == "Female")
# View(bd_mujeres1)library(dplyr)
bd_mujeres1 <- bd_mujeres1 %>%
select(-X:-X.22)# View(bd_mujeres1)write.csv(bd_mujeres1, "bd_mujeres limpia.csv", row.names = FALSE)# file.choose()
bd_mujereslimpia <- read.csv("/Users/dannaleal/Downloads/bd_mujeres limpia.csv")summary(bd_mujereslimpia)## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650919 Min. : -81.8 Min. : 0 Min. : 0.00
## 1st Qu.: 806228 1st Qu.: 20.1 1st Qu.: 0 1st Qu.: 0.00
## Median : 833851 Median : 223.0 Median : 0 Median : 0.00
## Mean : 8053898 Mean : 6504.3 Mean : 2423 Mean : 31.13
## 3rd Qu.: 7143280 3rd Qu.: 932.1 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62203889 Max. :2985247.9 Max. :2069575 Max. :90357.52
##
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : -1.2 Min. : -81.8 Length:59197 Length:59197
## 1st Qu.: 0.0 1st Qu.: 16.4 Class :character Class :character
## Median : 0.0 Median : 218.7 Mode :character Mode :character
## Mean : 2945.2 Mean : 3559.1
## 3rd Qu.: 0.0 3rd Qu.: 857.8
## Max. :492934.8 Max. :2700073.4
##
## IncidentDescription ReturnToWorkDate AverageWeeklyWage ClaimantOpenedDate
## Length:59197 Length:59197 Length:59197 Length:59197
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## ClaimantClosedDate EmployerNotificationDate ReceivedDate
## Length:59197 Length:59197 Length:59197
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## IsDenied ClaimantAge_at_DOI Gender ClaimantType
## Min. :0.00000 Length:59197 Length:59197 Length:59197
## 1st Qu.:0.00000 Class :character Class :character Class :character
## Median :0.00000 Mode :character Mode :character Mode :character
## Mean :0.05688
## 3rd Qu.:0.00000
## Max. :1.00000
##
## InjuryNature BodyPartRegion BodyPart BillReviewALE
## Length:59197 Length:59197 Length:59197 Min. : -456.00
## Class :character Class :character Class :character 1st Qu.: 8.25
## Mode :character Mode :character Mode :character Median : 24.00
## Mean : 174.80
## 3rd Qu.: 64.00
## Max. :20730.77
## NA's :46628
## Hospital PhysicianOutpatient Rx
## Min. :-12570.4 Min. : -162.9 Min. : -160.7
## 1st Qu.: 203.1 1st Qu.: 106.8 1st Qu.: 23.4
## Median : 572.9 Median : 220.2 Median : 61.1
## Mean : 4580.8 Mean : 1700.4 Mean : 1357.1
## 3rd Qu.: 2213.5 3rd Qu.: 667.2 3rd Qu.: 176.5
## Max. :667973.0 Max. :1481468.5 Max. :380924.3
## NA's :49187 NA's :34369 NA's :49906
## Total_Incurred_Cost_Claim
## Min. : -2961
## 1st Qu.: 22
## Median : 226
## Mean : 8897
## 3rd Qu.: 976
## Max. :5054823
##
library(dplyr)
# Crear una nueva base de datos con las columnas deseadas
bd_mujeresAR <- bd_mujereslimpia %>%
select(ClaimID, ClaimStatus, ClaimantAge_at_DOI, Gender, ClaimantType, ClaimantOpenedDate, ClaimantClosedDate, Total_Incurred_Cost_Claim)
# View(bd_mujeresAR)bd_mujeresAR$ClaimantOpenedDate <- as.Date(bd_mujeresAR$ClaimantOpenedDate, format = "%m/%d/%y")
bd_mujeresAR$ClaimantClosedDate <- as.Date(bd_mujeresAR$ClaimantClosedDate, format = "%m/%d/%y")
# Calcular la diferencia en días entre las fechas
bd_mujeresAR$TiempoDeProcesamientoDias <- as.numeric(difftime(bd_mujeresAR$ClaimantClosedDate, bd_mujeresAR$ClaimantOpenedDate, units = "days"))
# Eliminar las columnas originales de fecha
bd_mujeresAR <- bd_mujeresAR[, !(names(bd_mujeresAR) %in% c("ClaimantOpenedDate", "ClaimantClosedDate"))]
# View(bd_mujeresAR)summary(bd_mujeresAR)## ClaimID ClaimStatus ClaimantAge_at_DOI Gender
## Min. : 650919 Length:59197 Length:59197 Length:59197
## 1st Qu.: 806228 Class :character Class :character Class :character
## Median : 833851 Mode :character Mode :character Mode :character
## Mean : 8053898
## 3rd Qu.: 7143280
## Max. :62203889
##
## ClaimantType Total_Incurred_Cost_Claim TiempoDeProcesamientoDias
## Length:59197 Min. : -2961 Min. :-333.0
## Class :character 1st Qu.: 22 1st Qu.: 0.0
## Mode :character Median : 226 Median : 245.0
## Mean : 8897 Mean : 806.1
## 3rd Qu.: 976 3rd Qu.: 920.0
## Max. :5054823 Max. :6912.0
## NA's :54104
bd_mujeresAR$ClaimStatus <- factor(bd_mujeresAR$ClaimStatus, levels = c("C", "O", "R"), labels = c(1, 2, 3))
bd_mujeresAR$ClaimantAge_at_DOI <- as.numeric(bd_mujeresAR$ClaimantAge_at_DOI)## Warning: NAs introduced by coercion
bd_mujeresAR$Gender <- as.numeric(factor(bd_mujeresAR$Gender, levels = c("Male", "Female", "Not Provided"), labels = c(1, 2, 3)))
bd_mujeresAR$ClaimantType <- as.numeric(factor(bd_mujeresAR$ClaimantType, levels = c("Medical Only", "Indemnity", "Report Only"), labels = c(1, 2, 3)))
# View(bd_mujeresAR)bd <- na.omit(bd_mujeresAR)
# View(bd_mujeresAR)summary(bd_mujeresAR)## ClaimID ClaimStatus ClaimantAge_at_DOI Gender ClaimantType
## Min. : 650919 1:56900 Min. :-8000.00 Min. :2 Min. :1.000
## 1st Qu.: 806228 2: 1786 1st Qu.: 33.00 1st Qu.:2 1st Qu.:1.000
## Median : 833851 3: 511 Median : 43.00 Median :2 Median :1.000
## Mean : 8053898 Mean : 39.75 Mean :2 Mean :1.357
## 3rd Qu.: 7143280 3rd Qu.: 52.00 3rd Qu.:2 3rd Qu.:2.000
## Max. :62203889 Max. : 89.00 Max. :2 Max. :3.000
## NA's :17097
## Total_Incurred_Cost_Claim TiempoDeProcesamientoDias
## Min. : -2961 Min. :-333.0
## 1st Qu.: 22 1st Qu.: 0.0
## Median : 226 Median : 245.0
## Mean : 8897 Mean : 806.1
## 3rd Qu.: 976 3rd Qu.: 920.0
## Max. :5054823 Max. :6912.0
## NA's :54104
# Eliminar valores negativos en ClaimantAge_at_DOI
bd_mujeresAR <- bd_mujeresAR %>%
filter(ClaimantAge_at_DOI >= 0)
# Eliminar valores negativos en Total_Incurred_Cost_Claim
bd_mujeresAR <- bd_mujeresAR %>%
filter(Total_Incurred_Cost_Claim >= 0)
# Eliminar valores negativos en TiempoDeProcesamientoDías
bd_mujeresAR <- bd_mujeresAR %>%
filter(TiempoDeProcesamientoDias >= 0)
# View(bd_mujeresAR)
summary(bd_mujeresAR)## ClaimID ClaimStatus ClaimantAge_at_DOI Gender ClaimantType
## Min. : 650919 1:4015 Min. : 1.00 Min. :2 Min. :1.00
## 1st Qu.: 823404 2: 3 1st Qu.:34.00 1st Qu.:2 1st Qu.:1.00
## Median : 5970814 3: 0 Median :44.00 Median :2 Median :1.00
## Mean :15622363 Mean :43.14 Mean :2 Mean :1.68
## 3rd Qu.:30288888 3rd Qu.:52.00 3rd Qu.:2 3rd Qu.:2.00
## Max. :61592860 Max. :87.00 Max. :2 Max. :3.00
## Total_Incurred_Cost_Claim TiempoDeProcesamientoDias
## Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 33.0
## Median : 171.3 Median : 301.0
## Mean : 4459.3 Mean : 717.1
## 3rd Qu.: 1125.3 3rd Qu.: 844.0
## Max. :388620.8 Max. :6912.0
regresion <- lm(Total_Incurred_Cost_Claim ~ ClaimStatus + ClaimantAge_at_DOI +
Gender + ClaimantType + TiempoDeProcesamientoDias,
data = bd_mujeresAR)
summary(regresion)##
## Call:
## lm(formula = Total_Incurred_Cost_Claim ~ ClaimStatus + ClaimantAge_at_DOI +
## Gender + ClaimantType + TiempoDeProcesamientoDias, data = bd_mujeresAR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28757 -5382 -2119 625 359883
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8599.8188 1447.1614 -5.943 3.04e-09 ***
## ClaimStatus2 -145.6059 11808.9879 -0.012 0.99
## ClaimantAge_at_DOI 117.6975 27.5373 4.274 1.96e-05 ***
## Gender NA NA NA NA
## ClaimantType 2881.0214 406.1079 7.094 1.53e-12 ***
## TiempoDeProcesamientoDias 4.3814 0.3117 14.055 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20440 on 4013 degrees of freedom
## Multiple R-squared: 0.0524, Adjusted R-squared: 0.05145
## F-statistic: 55.48 on 4 and 4013 DF, p-value: < 2.2e-16
regresion <- lm(Total_Incurred_Cost_Claim ~ ClaimantAge_at_DOI + ClaimantType + TiempoDeProcesamientoDias,
data = bd_mujeresAR)
summary(regresion)##
## Call:
## lm(formula = Total_Incurred_Cost_Claim ~ ClaimantAge_at_DOI +
## ClaimantType + TiempoDeProcesamientoDias, data = bd_mujeresAR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28757 -5382 -2119 625 359883
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8600.2016 1446.6483 -5.945 3.00e-09 ***
## ClaimantAge_at_DOI 117.6993 27.5335 4.275 1.96e-05 ***
## ClaimantType 2881.1015 406.0053 7.096 1.51e-12 ***
## TiempoDeProcesamientoDias 4.3815 0.3116 14.061 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20440 on 4014 degrees of freedom
## Multiple R-squared: 0.0524, Adjusted R-squared: 0.05169
## F-statistic: 73.99 on 3 and 4014 DF, p-value: < 2.2e-16
datos <- data.frame(ClaimantAge_at_DOI = 43.14, ClaimantType = 1.68,
TiempoDeProcesamientoDias = 717.1)
predict(regresion, datos)## 1
## 4459.569
regresion <- lm(TiempoDeProcesamientoDias ~ ClaimStatus + ClaimantAge_at_DOI +
Gender + ClaimantType + Total_Incurred_Cost_Claim,
data = bd_mujeresAR)
summary(regresion)##
## Call:
## lm(formula = TiempoDeProcesamientoDias ~ ClaimStatus + ClaimantAge_at_DOI +
## Gender + ClaimantType + Total_Incurred_Cost_Claim, data = bd_mujeresAR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2718.6 -569.1 -289.9 50.1 6331.8
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.717e+03 6.655e+01 25.808 <2e-16 ***
## ClaimStatus2 -7.947e+02 5.837e+02 -1.362 0.173
## ClaimantAge_at_DOI -1.229e+01 1.351e+00 -9.103 <2e-16 ***
## Gender NA NA NA NA
## ClaimantType -3.078e+02 1.961e+01 -15.697 <2e-16 ***
## Total_Incurred_Cost_Claim 1.071e-02 7.619e-04 14.055 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1010 on 4013 degrees of freedom
## Multiple R-squared: 0.1137, Adjusted R-squared: 0.1128
## F-statistic: 128.7 on 4 and 4013 DF, p-value: < 2.2e-16
regresion <- lm(TiempoDeProcesamientoDias ~ ClaimantAge_at_DOI + ClaimantType + Total_Incurred_Cost_Claim,
data = bd_mujeresAR)
summary(regresion)##
## Call:
## lm(formula = TiempoDeProcesamientoDias ~ ClaimantAge_at_DOI +
## ClaimantType + Total_Incurred_Cost_Claim, data = bd_mujeresAR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2719.3 -569.3 -289.7 48.3 6332.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.716e+03 6.655e+01 25.789 <2e-16 ***
## ClaimantAge_at_DOI -1.229e+01 1.351e+00 -9.099 <2e-16 ***
## ClaimantType -3.075e+02 1.961e+01 -15.682 <2e-16 ***
## Total_Incurred_Cost_Claim 1.071e-02 7.620e-04 14.061 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1011 on 4014 degrees of freedom
## Multiple R-squared: 0.1133, Adjusted R-squared: 0.1126
## F-statistic: 171 on 3 and 4014 DF, p-value: < 2.2e-16
datos <- data.frame(ClaimantAge_at_DOI = 43.14, ClaimantType = 1.68,
Total_Incurred_Cost_Claim = 4459.3)
predict(regresion, datos)## 1
## 717.0795
Como se puede observar, en el segundo análisis de regresión se obtuvo una R cuadrada ajustada del 11.26%, mientras que en la primera se obtuvo una R cuadrada ajustada del 5.16%. Esto nos indica que el segundo análisis de regresión se ajusta mejor al modelo de los datos.
summary (bd_mujeresAR)## ClaimID ClaimStatus ClaimantAge_at_DOI Gender ClaimantType
## Min. : 650919 1:4015 Min. : 1.00 Min. :2 Min. :1.00
## 1st Qu.: 823404 2: 3 1st Qu.:34.00 1st Qu.:2 1st Qu.:1.00
## Median : 5970814 3: 0 Median :44.00 Median :2 Median :1.00
## Mean :15622363 Mean :43.14 Mean :2 Mean :1.68
## 3rd Qu.:30288888 3rd Qu.:52.00 3rd Qu.:2 3rd Qu.:2.00
## Max. :61592860 Max. :87.00 Max. :2 Max. :3.00
## Total_Incurred_Cost_Claim TiempoDeProcesamientoDias
## Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 33.0
## Median : 171.3 Median : 301.0
## Mean : 4459.3 Mean : 717.1
## 3rd Qu.: 1125.3 3rd Qu.: 844.0
## Max. :388620.8 Max. :6912.0
boxplot(bd_mujeresAR$Total_Incurred_Cost_Claim, horizontal = TRUE)boxplot(bd_mujeresAR$TiempoDeProcesamientoDias, horizontal = TRUE)# Crear una nueva base de datos con las columnas deseadas
bd_mujeresCL <- bd_mujeresAR[, c("ClaimID", "TiempoDeProcesamientoDias", "Total_Incurred_Cost_Claim")]
# Llamar a los renglones como ClaimID
rownames(bd_mujeresCL) <- bd_mujeresCL$ClaimID
bd_mujeresCL <- subset(bd_mujeresCL, select = -c(ClaimID))
# View(bd_mujeresCL)
# Columna de TiempoDeProcesamientoDias
IQR_TiempoDeProcesamientoDias <- IQR(bd_mujeresCL$TiempoDeProcesamientoDias)
IQR_TiempoDeProcesamientoDias## [1] 811
summary(bd_mujeresCL)## TiempoDeProcesamientoDias Total_Incurred_Cost_Claim
## Min. : 0.0 Min. : 0.0
## 1st Qu.: 33.0 1st Qu.: 0.0
## Median : 301.0 Median : 171.3
## Mean : 717.1 Mean : 4459.3
## 3rd Qu.: 844.0 3rd Qu.: 1125.3
## Max. :6912.0 Max. :388620.8
LI_TiempoDeProcesamientoDias <- 33 - 1.5*IQR_TiempoDeProcesamientoDias
LI_TiempoDeProcesamientoDias## [1] -1183.5
LS_TiempoDeProcesamientoDias <- 844 + 1.5*IQR_TiempoDeProcesamientoDias
LS_TiempoDeProcesamientoDias## [1] 2060.5
cat("LI_TiempoDeProcesamientoDias:", LI_TiempoDeProcesamientoDias, "\n")## LI_TiempoDeProcesamientoDias: -1183.5
cat("LS_TiempoDeProcesamientoDias:", LS_TiempoDeProcesamientoDias, "\n")## LS_TiempoDeProcesamientoDias: 2060.5
bd_mujeresCL <- bd_mujeresCL[bd_mujeresCL$TiempoDeProcesamientoDias <= 2061, ]
### Nota: se redondeó a 2061 porque el LS dió un resultado de 2060.5.
#Columna de Total_Incurred_Cost_Claim
IQR_Total_Incurred_Cost_Claim <- IQR(bd_mujeresCL$Total_Incurred_Cost_Claim)
IQR_Total_Incurred_Cost_Claim## [1] 1155.71
summary(bd_mujeresCL)## TiempoDeProcesamientoDias Total_Incurred_Cost_Claim
## Min. : 0.0 Min. : 0.0
## 1st Qu.: 4.0 1st Qu.: 0.0
## Median : 235.0 Median : 177.8
## Mean : 398.6 Mean : 3707.7
## 3rd Qu.: 548.0 3rd Qu.: 1155.7
## Max. :2058.0 Max. :246847.9
LI_Total_Incurred_Cost_Claim <- 0 - 1.5*IQR_Total_Incurred_Cost_Claim
LI_Total_Incurred_Cost_Claim## [1] -1733.565
LS_Total_Incurred_Cost_Claim <- 1125.3 + 1.5*IQR_Total_Incurred_Cost_Claim
LS_Total_Incurred_Cost_Claim## [1] 2858.865
cat("LI_Total_Incurred_Cost_Claim:", LI_Total_Incurred_Cost_Claim, "\n")## LI_Total_Incurred_Cost_Claim: -1733.565
cat("LS_Total_Incurred_Cost_Claim:", LS_Total_Incurred_Cost_Claim, "\n")## LS_Total_Incurred_Cost_Claim: 2858.865
bd_mujeresCL <- bd_mujeresCL[bd_mujeresCL$Total_Incurred_Cost_Claim <= 2859, ]
### Nota: se redondeó a 2859 porque el LS dió un resultado de 2858.865.
summary(bd_mujeresCL)## TiempoDeProcesamientoDias Total_Incurred_Cost_Claim
## Min. : 0.0 Min. : 0.00
## 1st Qu.: 0.0 1st Qu.: 0.00
## Median : 175.0 Median : 98.62
## Mean : 337.7 Mean : 372.37
## 3rd Qu.: 460.5 3rd Qu.: 428.27
## Max. :2058.0 Max. :2851.58
# O. Normalizar variables
bd_mujeresCL <- as.data.frame(scale(bd_mujeresCL))
# 1. Crear base de datos
bdmujeresCLUSTER <- bd_mujeresCL
# 2. Determinar el número de grupos
grupos <- 10
# 3. Realizar la clasificación
segmentos <- kmeans(bdmujeresCLUSTER,grupos)
# 4. Revisar la asignación de grupos
asignacion <- cbind(bdmujeresCLUSTER, cluster=segmentos$cluster)
# 5. Graficar asignaciones
# install.packages("ggplot2")
library(ggplot2)
# install.packages("factoextra")
library(factoextra)## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(segmentos, data = bdmujeresCLUSTER,
palette = c("darkorchid", "darkorange2", "aquamarine2", "pink", "blue", "darkolivegreen1", "salmon1", "skyblue3", "slategray2", "yellow"),
ellipse.type = "euclid",
star.plot = T,
repel = T,
ggtheme = theme())# 6. Optimizar la cantidad de grupos
library(cluster)
library(data.table)##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
set.seed(123)
optimizacion <- clusGap(bdmujeresCLUSTER, FUN = kmeans, nstart = 1, K.max = 10) ## Warning: did not converge in 10 iterations
plot(optimizacion, xlab = "Número de clusters K")Dado que el punto más alto en la gráfica es 10, eso nos indica que la cantidad de grupos óptimo es 10.