##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Convert the day/month/two-digit-year text columns into Date objects.
# With "%y" R reads 00-68 as 20xx and 69-99 as 19xx.
# NOTE(review): the summary below reports a ReceivedDate maximum of 2033-09-18,
# later than every other date column -- confirm the source value.
columnas_fecha <- c(
  "ClaimantOpenedDate",
  "ClaimantClosedDate",
  "IncidentDate",
  "EmployerNotificationDate",
  "ReceivedDate"
)
for (columna in columnas_fecha) {
  df_hombres[[columna]] <- as.Date(df_hombres[[columna]], format = "%d/%m/%y")
}
summary(df_hombres)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : -270 Min. : 0 Min. : 0
## 1st Qu.: 811585 1st Qu.: 75 1st Qu.: 0 1st Qu.: 0
## Median : 845656 Median : 250 Median : 0 Median : 0
## Mean :10174362 Mean : 7138 Mean : 2303 Mean : 91
## 3rd Qu.:22721079 3rd Qu.: 976 3rd Qu.: 0 3rd Qu.: 0
## Max. :62203891 Max. :4527291 Max. :1625903 Max. :130541
##
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : -475 Min. : -7820 Length:65125 Min. :1969-06-02
## 1st Qu.: 0 1st Qu.: 73 Class :character 1st Qu.:1999-08-16
## Median : 0 Median : 245 Mode :character Median :2004-03-10
## Mean : 3214 Mean : 3924 Mean :2004-04-07
## 3rd Qu.: 0 3rd Qu.: 896 3rd Qu.:2009-01-27
## Max. :640732 Max. :4129915 Max. :2014-06-27
##
## IncidentDescription ReturnToWorkDate AverageWeeklyWage ClaimantOpenedDate
## Length:65125 Length:65125 Length:65125 Min. :1980-07-03
## Class :character Class :character Class :character 1st Qu.:1999-09-27
## Mode :character Mode :character Mode :character Median :2004-04-22
## Mean :2004-05-25
## 3rd Qu.:2009-04-02
## Max. :2014-06-30
##
## ClaimantClosedDate EmployerNotificationDate ReceivedDate
## Min. :1999-06-01 Min. :1972-09-10 Min. :1980-07-03
## 1st Qu.:2005-03-31 1st Qu.:1999-11-05 1st Qu.:1999-10-01
## Median :2006-05-04 Median :2004-08-25 Median :2004-04-19
## Mean :2007-07-21 Mean :2004-10-03 Mean :2004-05-22
## 3rd Qu.:2009-12-17 3rd Qu.:2009-09-15 3rd Qu.:2009-02-19
## Max. :2014-06-30 Max. :2014-06-27 Max. :2033-09-18
## NA's :2316 NA's :11554
## IsDenied ClaimantAge_at_DOI Gender ClaimantType
## Min. :0.00000 Length:65125 Length:65125 Length:65125
## 1st Qu.:0.00000 Class :character Class :character Class :character
## Median :0.00000 Mode :character Mode :character Mode :character
## Mean :0.03995
## 3rd Qu.:0.00000
## Max. :1.00000
##
## InjuryNature BodyPartRegion BodyPart BillReviewALE
## Length:65125 Length:65125 Length:65125 Min. : -80.00
## Class :character Class :character Class :character 1st Qu.: 8.25
## Mode :character Mode :character Mode :character Median : 24.00
## Mean : 200.47
## 3rd Qu.: 65.64
## Max. :56475.30
## NA's :53799
## Hospital PhysicianOutpatient Rx
## Min. : -22.9 Min. : -549.5 Min. : -469.5
## 1st Qu.: 202.4 1st Qu.: 107.5 1st Qu.: 22.1
## Median : 582.1 Median : 223.2 Median : 59.6
## Mean : 4916.0 Mean : 1759.4 Mean : 1637.2
## 3rd Qu.: 2337.9 3rd Qu.: 695.3 3rd Qu.: 185.5
## Max. :2759604.0 Max. :1219766.6 Max. :631635.5
## NA's :55514 NA's :41065 NA's :56154
# Coerce the amount and wage columns to numeric.
# Text that cannot be read as a number becomes NA, which raises the
# coercion warning printed below.
columnas_numericas <- c(
  "TotalPaid",
  "TotalReserves",
  "TotalRecovery",
  "AverageWeeklyWage"
)
for (columna in columnas_numericas) {
  df_hombres[[columna]] <- as.numeric(df_hombres[[columna]])
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Rows: 65,125
## Columns: 26
## $ ClaimID <int> 650915, 650916, 650917, 650918, 650920, 65092…
## $ TotalPaid <dbl> 11947.55, 0.00, 9295.89, 1026.29, 331.90, 198…
## $ TotalReserves <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ TotalRecovery <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ IndemnityPaid <dbl> 243.65, 0.00, 0.00, 0.00, 0.00, 604.67, 0.00,…
## $ OtherPaid <dbl> 11703.90, 0.00, 9295.89, 1026.29, 331.90, 138…
## $ ClaimStatus <chr> "C", "C", "C", "C", "C", "C", "C", "C", "C", …
## $ IncidentDate <date> 2009-06-17, 2009-06-26, 2009-06-25, 2009-06-…
## $ IncidentDescription <chr> "Employee was moving concrete rings and insta…
## $ ReturnToWorkDate <chr> "08/12/09", "26/06/09", "13/07/09", "12/06/09…
## $ AverageWeeklyWage <dbl> 639.59, NA, 1649.00, NA, NA, 755.79, 402.90, …
## $ ClaimantOpenedDate <date> 2009-07-02, 2009-07-02, 2009-07-02, 2009-07-…
## $ ClaimantClosedDate <date> 2010-07-20, 2009-11-25, 2010-03-30, 2010-03-…
## $ EmployerNotificationDate <date> 2009-06-29, 2009-07-01, 2009-06-25, 2009-06-…
## $ ReceivedDate <date> 2009-07-02, 2009-07-02, 2009-07-02, 2009-07-…
## $ IsDenied <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ ClaimantAge_at_DOI <dbl> 49, 49, 47, 61, 28, 38, NA, NA, NA, NA, NA, N…
## $ Gender <chr> "Male", "Male", "Male", "Male", "Male", "Male…
## $ ClaimantType <chr> "Indemnity", "Medical Only", "Indemnity", "Me…
## $ InjuryNature <chr> "Strain", "Strain", "Fracture", "Contusion", …
## $ BodyPartRegion <chr> "Trunk", "Trunk", "Upper Extremities", "Upper…
## $ BodyPart <chr> "Lower Back Area", "Lower Back Area", "Hand",…
## $ BillReviewALE <dbl> NA, NA, NA, NA, NA, NA, 8.00, NA, NA, 1478.35…
## $ Hospital <dbl> NA, NA, NA, NA, NA, NA, 11806.16, NA, NA, 411…
## $ PhysicianOutpatient <dbl> NA, NA, NA, NA, NA, NA, 5978.01, 109.22, 77.6…
## $ Rx <dbl> NA, NA, NA, NA, NA, NA, 11.97, NA, NA, 35426.…
# Incurred cost per claim: reserves plus payments, minus recoveries.
df_hombres$Total_Incurred_Cost <- with(
  df_hombres,
  TotalReserves + TotalPaid - TotalRecovery
)
# TP: days between the opened and closed dates, stored as a plain number.
# Rows with no closed date get NA.
df_hombres$TP <- as.numeric(
  difftime(
    df_hombres$ClaimantClosedDate,
    df_hombres$ClaimantOpenedDate,
    units = "days"
  )
)
summary(df_hombres)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : -270 Min. : 0 Min. : 0
## 1st Qu.: 811585 1st Qu.: 75 1st Qu.: 0 1st Qu.: 0
## Median : 845656 Median : 250 Median : 0 Median : 0
## Mean :10174362 Mean : 7138 Mean : 2303 Mean : 91
## 3rd Qu.:22721079 3rd Qu.: 976 3rd Qu.: 0 3rd Qu.: 0
## Max. :62203891 Max. :4527291 Max. :1625903 Max. :130541
##
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : -475 Min. : -7820 Length:65125 Min. :1969-06-02
## 1st Qu.: 0 1st Qu.: 73 Class :character 1st Qu.:1999-08-16
## Median : 0 Median : 245 Mode :character Median :2004-03-10
## Mean : 3214 Mean : 3924 Mean :2004-04-07
## 3rd Qu.: 0 3rd Qu.: 896 3rd Qu.:2009-01-27
## Max. :640732 Max. :4129915 Max. :2014-06-27
##
## IncidentDescription ReturnToWorkDate AverageWeeklyWage
## Length:65125 Length:65125 Min. : 0.0
## Class :character Class :character 1st Qu.: 300.0
## Mode :character Mode :character Median : 504.8
## Mean : 673.0
## 3rd Qu.: 686.5
## Max. :2024000.0
## NA's :41338
## ClaimantOpenedDate ClaimantClosedDate EmployerNotificationDate
## Min. :1980-07-03 Min. :1999-06-01 Min. :1972-09-10
## 1st Qu.:1999-09-27 1st Qu.:2005-03-31 1st Qu.:1999-11-05
## Median :2004-04-22 Median :2006-05-04 Median :2004-08-25
## Mean :2004-05-25 Mean :2007-07-21 Mean :2004-10-03
## 3rd Qu.:2009-04-02 3rd Qu.:2009-12-17 3rd Qu.:2009-09-15
## Max. :2014-06-30 Max. :2014-06-30 Max. :2014-06-27
## NA's :2316 NA's :11554
## ReceivedDate IsDenied ClaimantAge_at_DOI Gender
## Min. :1980-07-03 Min. :0.00000 Min. :-7951.00 Length:65125
## 1st Qu.:1999-10-01 1st Qu.:0.00000 1st Qu.: 32.00 Class :character
## Median :2004-04-19 Median :0.00000 Median : 42.00 Mode :character
## Mean :2004-05-22 Mean :0.03995 Mean : 39.93
## 3rd Qu.:2009-02-19 3rd Qu.:0.00000 3rd Qu.: 51.00
## Max. :2033-09-18 Max. :1.00000 Max. : 94.00
## NA's :18837
## ClaimantType InjuryNature BodyPartRegion BodyPart
## Length:65125 Length:65125 Length:65125 Length:65125
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BillReviewALE Hospital PhysicianOutpatient Rx
## Min. : -80.00 Min. : -22.9 Min. : -549.5 Min. : -469.5
## 1st Qu.: 8.25 1st Qu.: 202.4 1st Qu.: 107.5 1st Qu.: 22.1
## Median : 24.00 Median : 582.1 Median : 223.2 Median : 59.6
## Mean : 200.47 Mean : 4916.0 Mean : 1759.4 Mean : 1637.2
## 3rd Qu.: 65.64 3rd Qu.: 2337.9 3rd Qu.: 695.3 3rd Qu.: 185.5
## Max. :56475.30 Max. :2759604.0 Max. :1219766.6 Max. :631635.5
## NA's :53799 NA's :55514 NA's :41065 NA's :56154
## Total_Incurred_Cost TP
## Min. : -10682 Min. : 0
## 1st Qu.: 72 1st Qu.: 178
## Median : 247 Median : 967
## Mean : 9350 Mean :1251
## 3rd Qu.: 998 3rd Qu.:2026
## Max. :4734750 Max. :7661
## NA's :2316
# Working copy restricted to claims with a strictly positive incurred cost
# and a known TP (rows whose TP is NA are dropped).
df_hombres_3 <- df_hombres[df_hombres$Total_Incurred_Cost > 0, ] %>%
  filter(!is.na(TP))
summary(df_hombres_3)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : 0.4 Min. :0 Min. : 0.00
## 1st Qu.: 809847 1st Qu.: 152.2 1st Qu.:0 1st Qu.: 0.00
## Median : 844908 Median : 341.5 Median :0 Median : 0.00
## Mean : 9510746 Mean : 5514.9 Mean :0 Mean : 83.25
## 3rd Qu.:22720599 3rd Qu.: 1223.9 3rd Qu.:0 3rd Qu.: 0.00
## Max. :61591294 Max. :1758919.5 Max. :0 Max. :130335.75
##
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : -475 Min. : -7820.5 Length:50997 Min. :1978-04-06
## 1st Qu.: 0 1st Qu.: 150.0 Class :character 1st Qu.:1999-11-05
## Median : 0 Median : 332.6 Mode :character Median :2004-02-09
## Mean : 2551 Mean : 2964.2 Mean :2004-02-15
## 3rd Qu.: 0 3rd Qu.: 1110.0 3rd Qu.:2008-08-14
## Max. :640732 Max. :1602337.1 Max. :2014-06-09
##
## IncidentDescription ReturnToWorkDate AverageWeeklyWage
## Length:50997 Length:50997 Min. : 0.0
## Class :character Class :character 1st Qu.: 300.0
## Mode :character Mode :character Median : 500.0
## Mean : 707.2
## 3rd Qu.: 682.5
## Max. :2024000.0
## NA's :32193
## ClaimantOpenedDate ClaimantClosedDate EmployerNotificationDate
## Min. :1987-06-10 Min. :1999-06-01 Min. :1978-04-07
## 1st Qu.:1999-12-13 1st Qu.:2005-03-31 1st Qu.:2000-03-13
## Median :2004-03-15 Median :2006-06-07 Median :2004-08-16
## Mean :2004-03-25 Mean :2007-07-18 Mean :2004-09-11
## 3rd Qu.:2008-09-16 3rd Qu.:2010-02-10 3rd Qu.:2009-04-21
## Max. :2014-06-12 Max. :2014-06-30 Max. :2014-06-09
## NA's :10514
## ReceivedDate IsDenied ClaimantAge_at_DOI Gender
## Min. :1987-06-10 Min. :0.00000 Min. :-7951.00 Length:50997
## 1st Qu.:1999-12-17 1st Qu.:0.00000 1st Qu.: 32.00 Class :character
## Median :2004-03-17 Median :0.00000 Median : 41.00 Mode :character
## Mean :2004-03-27 Mean :0.02998 Mean : 39.72
## 3rd Qu.:2008-09-08 3rd Qu.:0.00000 3rd Qu.: 50.00
## Max. :2033-09-18 Max. :1.00000 Max. : 94.00
## NA's :14136
## ClaimantType InjuryNature BodyPartRegion BodyPart
## Length:50997 Length:50997 Length:50997 Length:50997
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BillReviewALE Hospital PhysicianOutpatient Rx
## Min. : -80.00 Min. : 0.0 Min. : -549.5 Min. : -38.9
## 1st Qu.: 8.25 1st Qu.: 205.4 1st Qu.: 109.0 1st Qu.: 22.6
## Median : 24.00 Median : 586.6 Median : 225.7 Median : 60.2
## Mean : 189.69 Mean : 4958.5 Mean : 1746.3 Mean : 1451.4
## 3rd Qu.: 65.60 3rd Qu.: 2323.4 3rd Qu.: 699.1 3rd Qu.: 187.6
## Max. :46055.35 Max. :2759604.0 Max. :1219766.6 Max. :318013.2
## NA's :41751 NA's :43310 NA's :31731 NA's :43757
## Total_Incurred_Cost TP
## Min. : 0.1 Min. : 0
## 1st Qu.: 150.0 1st Qu.: 197
## Median : 336.0 Median : 933
## Mean : 5431.6 Mean :1210
## 3rd Qu.: 1192.4 3rd Qu.:1891
## Max. :1758919.5 Max. :7661
##
# Look for outliers and remove them if necessary
boxplot(df_hombres_3$Total_Incurred_Cost, horizontal = TRUE)
# Keep only claims whose incurred cost is below 500000.
df_hombres_3 <- df_hombres_3[df_hombres_3$Total_Incurred_Cost < 500000, ]
# For processing time
boxplot(df_hombres_3$TP, horizontal = TRUE)
# Table used for clustering: the claim identifier plus the two variables.
df_hombresC <- df_hombres_3[, c("ClaimID", "Total_Incurred_Cost", "TP")]
# Use ClaimID as the row names, then drop it so only the two numeric
# variables remain as columns.
rownames(df_hombresC) <- df_hombresC$ClaimID
df_hombresC <- subset(df_hombresC, select = -c(ClaimID))
# Interquartile range of the processing time.
IQR_TP <- IQR(df_hombresC$TP)
IQR_TP
## [1] 1694
## Total_Incurred_Cost TP
## Min. : 0.1 Min. : 0
## 1st Qu.: 150.0 1st Qu.: 197
## Median : 336.0 Median : 932
## Mean : 5258.7 Mean :1210
## 3rd Qu.: 1191.9 3rd Qu.:1891
## Max. :470622.7 Max. :7661
## [1] -2344
## [1] 4432
## [1] 1045.553
## [1] -1418.329
## [1] 2760.229
## Total_Incurred_Cost TP
## Min. : 0.07 Min. : 0
## 1st Qu.: 120.59 1st Qu.: 158
## Median : 226.02 Median : 908
## Mean : 316.89 Mean :1171
## 3rd Qu.: 425.93 3rd Qu.:1887
## Max. :1193.32 Max. :4431
# 0. Standardise the variables (centre and scale each column)
df_hombresC <- as.data.frame(scale(df_hombresC))
# 1. Create the data set
CL <- df_hombresC
# 2. Set the number of groups
grupos <- 7
# 3. Run the classification
# k-means starts from randomly chosen centres, so fix the seed to make the
# assignment reproducible and keep the best of several random starts.
set.seed(123)
segmentos <- kmeans(CL, centers = grupos, nstart = 10)
# 4. Review the group assignment
asignacion <- cbind(CL, cluster = segmentos$cluster)
# One colour per group; the palette length matches `grupos`.
fviz_cluster(
  segmentos,
  data = CL,
  palette = c("red", "blue", "black", "green", "orange", "pink", "purple"),
  ellipse.type = "euclid",
  star.plot = TRUE,
  repel = TRUE,
  ggtheme = theme()
)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 1895550)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 1895550)