Este es un proyecto proveniente de North Carolina, US, quienes gestionan
reclamaciones de compensación laboral para otros clientes. Buscan
asesoramiento sobre cómo se puede utilizar el análisis de datos para
identificar los principales factores de los costos de las reclamaciones
y el tiempo para procesar las reclamaciones. La empresa está interesada
en utilizar el análisis de datos para ser eficiente y receptiva ante los
reclamantes.
library(readxl)
library(tidyverse)## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(factoextra)## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(data.table)##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
df1 <- read_xlsx("/Users/benjaminreyessanchez/Downloads/ClaimsData2018.xlsx")
df2 <- read.csv("/Users/benjaminreyessanchez/Downloads/TransactionsSummary2018.csv")df <- left_join(df1, df2, by = "ClaimID")
str(df)## tibble [134,004 × 26] (S3: tbl_df/tbl/data.frame)
## $ ClaimID : num [1:134004] 650915 650916 650917 650918 650919 ...
## $ TotalPaid : chr [1:134004] "11947.55" "0.00" "9295.89" "1026.29" ...
## $ TotalReserves : chr [1:134004] "0.00" "0.00" "0.00" "0.00" ...
## $ TotalRecovery : chr [1:134004] "0.00" "0.00" "0.00" "0.00" ...
## $ IndemnityPaid : chr [1:134004] "243.65" "0.00" "0.00" "0.00" ...
## $ OtherPaid : chr [1:134004] "11703.90" "0.00" "9295.89" "1026.29" ...
## $ ClaimStatus : chr [1:134004] "C" "C" "C" "C" ...
## $ IncidentDate : POSIXct[1:134004], format: "2009-06-17" "2009-06-26" ...
## $ IncidentDescription : chr [1:134004] "Employee was moving concrete rings and installing a meter. He strained lower back." "Employee was pulling lining. He felt a pop in the back causing a strain." "Employee was in the restroom. He heard a scream from another restroom that startled him and he fell on his left"| __truncated__ "Employee was unloading truck using a pallet jack to unload heavy equipment. Heavy load caused him to lose balan"| __truncated__ ...
## $ ReturnToWorkDate : POSIXct[1:134004], format: "2009-12-08" "2009-06-26" ...
## $ AverageWeeklyWage : chr [1:134004] "639.59" "NULL" "1649.00" "NULL" ...
## $ ClaimantOpenedDate : POSIXct[1:134004], format: "2009-07-02" "2009-07-02" ...
## $ ClaimantClosedDate : POSIXct[1:134004], format: "2010-07-20" "2009-11-25" ...
## $ EmployerNotificationDate: POSIXct[1:134004], format: "2009-06-29" "2009-07-01" ...
## $ ReceivedDate : POSIXct[1:134004], format: "2009-07-02" "2009-07-02" ...
## $ IsDenied : chr [1:134004] "0" "0" "0" "0" ...
## $ ClaimantAge_at_DOI : chr [1:134004] "49" "49" "47" "61" ...
## $ Gender : chr [1:134004] "Male" "Male" "Male" "Male" ...
## $ ClaimantType : chr [1:134004] "Indemnity" "Medical Only" "Indemnity" "Medical Only" ...
## $ InjuryNature : chr [1:134004] "Strain" "Strain" "Fracture" "Contusion" ...
## $ BodyPartRegion : chr [1:134004] "Trunk" "Trunk" "Upper Extremities" "Upper Extremities" ...
## $ BodyPart : chr [1:134004] "Lower Back Area" "Lower Back Area" "Hand" "Shoulder(S)" ...
## $ BillReviewALE : num [1:134004] NA NA NA NA NA NA NA NA 16 8 ...
## $ Hospital : num [1:134004] NA NA NA NA NA ...
## $ PhysicianOutpatient : num [1:134004] NA NA NA NA NA ...
## $ Rx : num [1:134004] NA NA NA NA NA ...
summary(df)## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Length:134004 Length:134004 Length:134004
## 1st Qu.: 811125 Class :character Class :character Class :character
## Median : 844626 Mode :character Mode :character Mode :character
## Mean :10149151
## 3rd Qu.:22716506
## Max. :62203891
##
## IndemnityPaid OtherPaid ClaimStatus
## Length:134004 Length:134004 Length:134004
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## IncidentDate IncidentDescription
## Min. :1947-02-24 00:00:00.0 Length:134004
## 1st Qu.:1998-12-21 00:00:00.0 Class :character
## Median :2004-01-05 00:00:00.0 Mode :character
## Mean :2003-12-08 16:48:47.2
## 3rd Qu.:2009-02-02 00:00:00.0
## Max. :2014-06-27 00:00:00.0
##
## ReturnToWorkDate AverageWeeklyWage
## Min. :1976-10-29 00:00:00.00 Length:134004
## 1st Qu.:2002-04-25 00:00:00.00 Class :character
## Median :2007-07-09 00:00:00.00 Mode :character
## Mean :2006-06-01 15:30:12.00
## 3rd Qu.:2011-06-01 00:00:00.00
## Max. :2015-05-07 00:00:00.00
## NA's :58637
## ClaimantOpenedDate ClaimantClosedDate
## Min. :1947-02-24 00:00:00.0 Min. :1999-06-01 00:00:00.00
## 1st Qu.:1999-02-09 00:00:00.0 1st Qu.:2005-03-31 00:00:00.00
## Median :2004-02-17 00:00:00.0 Median :2006-04-04 12:00:00.00
## Mean :2004-01-23 11:32:07.5 Mean :2007-05-24 02:11:30.00
## 3rd Qu.:2009-04-09 06:00:00.0 3rd Qu.:2009-11-11 00:00:00.00
## Max. :2014-06-30 00:00:00.0 Max. :2014-06-30 00:00:00.00
## NA's :4678
## EmployerNotificationDate ReceivedDate
## Min. :1972-09-10 00:00:00.0 Min. :1947-02-24 00:00:00.0
## 1st Qu.:2000-03-13 00:00:00.0 1st Qu.:1999-02-09 00:00:00.0
## Median :2004-12-28 00:00:00.0 Median :2004-02-13 00:00:00.0
## Mean :2005-08-29 04:49:34.2 Mean :2004-07-19 11:29:03.0
## 3rd Qu.:2009-11-03 00:00:00.0 3rd Qu.:2009-02-27 00:00:00.0
## Max. :9999-07-21 00:00:00.0 Max. :9999-07-21 00:00:00.0
## NA's :22288
## IsDenied ClaimantAge_at_DOI Gender ClaimantType
## Length:134004 Length:134004 Length:134004 Length:134004
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## InjuryNature BodyPartRegion BodyPart BillReviewALE
## Length:134004 Length:134004 Length:134004 Min. : -456.00
## Class :character Class :character Class :character 1st Qu.: 8.25
## Mode :character Mode :character Mode :character Median : 24.00
## Mean : 189.89
## 3rd Qu.: 65.20
## Max. :56475.30
## NA's :109798
## Hospital PhysicianOutpatient Rx
## Min. : -12570.4 Min. : -549.5 Min. : -469.5
## 1st Qu.: 202.5 1st Qu.: 107.3 1st Qu.: 22.7
## Median : 580.5 Median : 222.7 Median : 60.0
## Mean : 4773.9 Mean : 1762.7 Mean : 1459.0
## 3rd Qu.: 2298.5 3rd Qu.: 689.9 3rd Qu.: 178.3
## Max. :2759604.0 Max. :1481468.5 Max. :631635.5
## NA's :113050 NA's :81900 NA's :114237
sapply(df, function(x) sum(is.na(x)))## ClaimID TotalPaid TotalReserves
## 0 0 0
## TotalRecovery IndemnityPaid OtherPaid
## 0 0 0
## ClaimStatus IncidentDate IncidentDescription
## 0 0 0
## ReturnToWorkDate AverageWeeklyWage ClaimantOpenedDate
## 58637 0 0
## ClaimantClosedDate EmployerNotificationDate ReceivedDate
## 4678 22288 0
## IsDenied ClaimantAge_at_DOI Gender
## 0 0 0
## ClaimantType InjuryNature BodyPartRegion
## 0 0 0
## BodyPart BillReviewALE Hospital
## 0 109798 113050
## PhysicianOutpatient Rx
## 81900 114237
Podemos Observar lo siguiente: - Tenemos muchos valores como NA. - Una gran parte de las variables están determinadas como carácter y hay que cambiarlas a numérico. - Hay columnas que podemos eliminar que no nos sirven para ninguno de los análisis.
df$TotalPaid <- as.numeric(df$TotalPaid)
df$TotalReserves <- as.numeric(df$TotalReserves)
df$TotalRecovery <- as.numeric(df$TotalRecovery)
df$IndemnityPaid <- as.numeric(df$IndemnityPaid)
df$OtherPaid <- as.numeric(df$OtherPaid)
df$AverageWeeklyWage <- as.numeric(df$AverageWeeklyWage)## Warning: NAs introduced by coercion
df$ClaimantAge_at_DOI <- as.numeric(df$ClaimantAge_at_DOI)## Warning: NAs introduced by coercion
summary(df)## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : -270 Min. : 0 Min. : 0.00
## 1st Qu.: 811125 1st Qu.: 60 1st Qu.: 0 1st Qu.: 0.00
## Median : 844626 Median : 235 Median : 0 Median : 0.00
## Mean :10149151 Mean : 6746 Mean : 2233 Mean : 68.88
## 3rd Qu.:22716506 3rd Qu.: 938 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62203891 Max. :4527291 Max. :2069575 Max. :130541.03
##
## IndemnityPaid OtherPaid ClaimStatus
## Min. : -475 Min. : -7820 Length:134004
## 1st Qu.: 0 1st Qu.: 58 Class :character
## Median : 0 Median : 230 Mode :character
## Mean : 3061 Mean : 3685
## 3rd Qu.: 0 3rd Qu.: 855
## Max. :640732 Max. :4129915
##
## IncidentDate IncidentDescription
## Min. :1947-02-24 00:00:00.0 Length:134004
## 1st Qu.:1998-12-21 00:00:00.0 Class :character
## Median :2004-01-05 00:00:00.0 Mode :character
## Mean :2003-12-08 16:48:47.2
## 3rd Qu.:2009-02-02 00:00:00.0
## Max. :2014-06-27 00:00:00.0
##
## ReturnToWorkDate AverageWeeklyWage
## Min. :1976-10-29 00:00:00.00 Min. : 0.0
## 1st Qu.:2002-04-25 00:00:00.00 1st Qu.: 300.0
## Median :2007-07-09 00:00:00.00 Median : 492.0
## Mean :2006-06-01 15:30:12.00 Mean : 587.3
## 3rd Qu.:2011-06-01 00:00:00.00 3rd Qu.: 660.4
## Max. :2015-05-07 00:00:00.00 Max. :2024000.0
## NA's :58637 NA's :84924
## ClaimantOpenedDate ClaimantClosedDate
## Min. :1947-02-24 00:00:00.0 Min. :1999-06-01 00:00:00.00
## 1st Qu.:1999-02-09 00:00:00.0 1st Qu.:2005-03-31 00:00:00.00
## Median :2004-02-17 00:00:00.0 Median :2006-04-04 12:00:00.00
## Mean :2004-01-23 11:32:07.5 Mean :2007-05-24 02:11:30.00
## 3rd Qu.:2009-04-09 06:00:00.0 3rd Qu.:2009-11-11 00:00:00.00
## Max. :2014-06-30 00:00:00.0 Max. :2014-06-30 00:00:00.00
## NA's :4678
## EmployerNotificationDate ReceivedDate
## Min. :1972-09-10 00:00:00.0 Min. :1947-02-24 00:00:00.0
## 1st Qu.:2000-03-13 00:00:00.0 1st Qu.:1999-02-09 00:00:00.0
## Median :2004-12-28 00:00:00.0 Median :2004-02-13 00:00:00.0
## Mean :2005-08-29 04:49:34.2 Mean :2004-07-19 11:29:03.0
## 3rd Qu.:2009-11-03 00:00:00.0 3rd Qu.:2009-02-27 00:00:00.0
## Max. :9999-07-21 00:00:00.0 Max. :9999-07-21 00:00:00.0
## NA's :22288
## IsDenied ClaimantAge_at_DOI Gender ClaimantType
## Length:134004 Min. :-8000.00 Length:134004 Length:134004
## Class :character 1st Qu.: 33.00 Class :character Class :character
## Mode :character Median : 42.00 Mode :character Mode :character
## Mean : 39.85
## 3rd Qu.: 51.00
## Max. : 94.00
## NA's :45078
## InjuryNature BodyPartRegion BodyPart BillReviewALE
## Length:134004 Length:134004 Length:134004 Min. : -456.00
## Class :character Class :character Class :character 1st Qu.: 8.25
## Mode :character Mode :character Mode :character Median : 24.00
## Mean : 189.89
## 3rd Qu.: 65.20
## Max. :56475.30
## NA's :109798
## Hospital PhysicianOutpatient Rx
## Min. : -12570.4 Min. : -549.5 Min. : -469.5
## 1st Qu.: 202.5 1st Qu.: 107.3 1st Qu.: 22.7
## Median : 580.5 Median : 222.7 Median : 60.0
## Mean : 4773.9 Mean : 1762.7 Mean : 1459.0
## 3rd Qu.: 2298.5 3rd Qu.: 689.9 3rd Qu.: 178.3
## Max. :2759604.0 Max. :1481468.5 Max. :631635.5
## NA's :113050 NA's :81900 NA's :114237
Ahora eliminamos los valores negativos que no pueden existir dentro de nuestra base de datos.
df_inicial<-df
df_inicial<- subset(df_inicial, TotalPaid>0)
df_inicial<- subset(df_inicial, IndemnityPaid>0)
summary(df_inicial)## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : 27 Min. : 0 Min. : 0.0
## 1st Qu.: 813720 1st Qu.: 3766 1st Qu.: 0 1st Qu.: 0.0
## Median : 847241 Median : 14088 Median : 0 Median : 0.0
## Mean :10104568 Mean : 44975 Mean : 15078 Mean : 407.5
## 3rd Qu.:22716889 3rd Qu.: 40479 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :62032209 Max. :4527291 Max. :2069575 Max. :130541.0
##
## IndemnityPaid OtherPaid ClaimStatus
## Min. : 15.5 Min. : -7820 Length:18277
## 1st Qu.: 1265.5 1st Qu.: 1608 Class :character
## Median : 5663.6 Median : 6773 Mode :character
## Mean : 22439.6 Mean : 22535
## 3rd Qu.: 17481.1 3rd Qu.: 20426
## Max. :640732.3 Max. :4129915
##
## IncidentDate IncidentDescription
## Min. :1967-02-06 00:00:00.00 Length:18277
## 1st Qu.:1998-08-17 00:00:00.00 Class :character
## Median :2003-11-13 00:00:00.00 Mode :character
## Mean :2003-05-16 07:07:44.25
## 3rd Qu.:2008-09-15 00:00:00.00
## Max. :2014-06-18 00:00:00.00
##
## ReturnToWorkDate AverageWeeklyWage
## Min. :1984-01-01 00:00:00.00 Min. : 1.0
## 1st Qu.:2000-12-10 12:00:00.00 1st Qu.: 438.6
## Median :2005-09-12 00:00:00.00 Median : 541.5
## Mean :2005-05-14 09:48:15.77 Mean : 595.5
## 3rd Qu.:2010-01-20 00:00:00.00 3rd Qu.: 699.6
## Max. :2014-06-19 00:00:00.00 Max. :15062.3
## NA's :6417 NA's :3918
## ClaimantOpenedDate ClaimantClosedDate
## Min. :1967-02-06 00:00:00.00 Min. :1999-07-12 00:00:00.00
## 1st Qu.:1998-11-20 00:00:00.00 1st Qu.:2005-03-31 00:00:00.00
## Median :2004-02-17 00:00:00.00 Median :2006-06-05 00:00:00.00
## Mean :2003-09-11 12:50:56.16 Mean :2007-04-17 09:40:29.75
## 3rd Qu.:2009-06-16 00:00:00.00 3rd Qu.:2010-04-26 00:00:00.00
## Max. :2014-06-24 00:00:00.00 Max. :2014-06-30 00:00:00.00
## NA's :2262
## EmployerNotificationDate ReceivedDate
## Min. :1978-04-07 00:00:00.00 Min. :1967-02-06 00:00:00.00
## 1st Qu.:2000-06-23 18:00:00.00 1st Qu.:1998-11-05 00:00:00.00
## Median :2005-03-16 12:00:00.00 Median :2004-01-05 00:00:00.00
## Mean :2005-06-15 21:45:38.46 Mean :2003-10-23 12:36:54.72
## 3rd Qu.:2009-08-15 12:00:00.00 3rd Qu.:2008-10-01 00:00:00.00
## Max. :9999-07-01 00:00:00.00 Max. :4996-02-17 00:00:00.00
## NA's :4237
## IsDenied ClaimantAge_at_DOI Gender ClaimantType
## Length:18277 Min. :-5973.00 Length:18277 Length:18277
## Class :character 1st Qu.: 37.00 Class :character Class :character
## Mode :character Median : 45.00 Mode :character Mode :character
## Mean : 43.56
## 3rd Qu.: 53.00
## Max. : 87.00
## NA's :5549
## InjuryNature BodyPartRegion BodyPart BillReviewALE
## Length:18277 Length:18277 Length:18277 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 8.25
## Mode :character Mode :character Mode :character Median : 24.00
## Mean : 181.49
## 3rd Qu.: 64.00
## Max. :15836.77
## NA's :15167
## Hospital PhysicianOutpatient Rx
## Min. : 0.0 Min. : -18.8 Min. : 0.00
## 1st Qu.: 185.7 1st Qu.: 104.0 1st Qu.: 21.66
## Median : 527.9 Median : 216.2 Median : 56.91
## Mean : 4185.7 Mean : 1621.1 Mean : 1388.27
## 3rd Qu.: 2047.0 3rd Qu.: 655.3 3rd Qu.: 175.90
## Max. :408665.0 Max. :318039.6 Max. :187627.17
## NA's :15339 NA's :10953 NA's :15523
#Solamente aplicado a dos variables ya que si se hicieran con las 3 variables faltantes que contienen negativos se reducen demasiado el número de obs.No hice una nueva columna para costo total incurrido porque me di cuenta que la variable “TotalPaid” ya es la sumatoria total del costo total de cada paciente, tomando en cuenta TotalReserves, IndemnityPaid y OtherPaid
#PARA EL COSTO TOTAL INCURRIDO
df_regr <- df_inicial
boxplot(df_inicial$TotalPaid, horizontal = TRUE)df_regr <- df_inicial[df_inicial$TotalPaid > 0 & df_inicial$TotalPaid < 150000, ]
boxplot(df_regr$TotalPaid, horizontal = TRUE)df_regr <- subset(df_regr, select = -IncidentDescription)
df_regr$AverageWeeklyWage <- replace(df_regr$AverageWeeklyWage, is.na(df_regr$AverageWeeklyWage), 592.9)summary(df_regr)## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : 26.88 Min. : 0 Min. : 0.0
## 1st Qu.: 814288 1st Qu.: 3330.54 1st Qu.: 0 1st Qu.: 0.0
## Median : 850173 Median : 11847.95 Median : 0 Median : 0.0
## Mean :10434983 Mean : 23907.09 Mean : 4933 Mean : 307.4
## 3rd Qu.:22717577 3rd Qu.: 31129.08 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :62032209 Max. :149949.17 Max. :659235 Max. :85000.0
##
## IndemnityPaid OtherPaid ClaimStatus
## Min. : 15.5 Min. : -7820 Length:16857
## 1st Qu.: 1079.4 1st Qu.: 1413 Class :character
## Median : 4664.3 Median : 5614 Mode :character
## Mean : 11742.7 Mean : 12164
## 3rd Qu.: 13089.5 3rd Qu.: 16237
## Max. :147420.0 Max. :137142
##
## IncidentDate ReturnToWorkDate
## Min. :1971-11-24 00:00:00.00 Min. :1984-01-01 00:00:00.00
## 1st Qu.:1998-07-18 00:00:00.00 1st Qu.:2000-10-19 18:00:00.00
## Median :2003-10-09 00:00:00.00 Median :2005-06-28 00:00:00.00
## Mean :2003-05-21 23:48:48.55 Mean :2005-03-26 13:16:08.78
## 3rd Qu.:2008-10-20 00:00:00.00 3rd Qu.:2009-11-18 00:00:00.00
## Max. :2014-06-18 00:00:00.00 Max. :2014-06-19 00:00:00.00
## NA's :5605
## AverageWeeklyWage ClaimantOpenedDate
## Min. : 1.0 Min. :1971-11-24 00:00:00.00
## 1st Qu.: 467.0 1st Qu.:1998-09-22 00:00:00.00
## Median : 592.9 Median :2003-11-18 00:00:00.00
## Mean : 592.9 Mean :2003-07-29 11:06:44.26
## 3rd Qu.: 638.4 3rd Qu.:2009-03-11 00:00:00.00
## Max. :15062.3 Max. :2014-06-24 00:00:00.00
##
## ClaimantClosedDate EmployerNotificationDate
## Min. :1999-07-12 00:00:00.00 Min. :1978-04-07 00:00:00.00
## 1st Qu.:2005-03-31 00:00:00.00 1st Qu.:2000-06-21 00:00:00.00
## Median :2006-03-31 00:00:00.00 Median :2005-04-09 00:00:00.00
## Mean :2007-03-24 08:30:53.72 Mean :2005-08-15 23:35:58.44
## 3rd Qu.:2010-03-29 00:00:00.00 3rd Qu.:2009-11-04 18:00:00.00
## Max. :2014-06-30 00:00:00.00 Max. :9999-07-01 00:00:00.00
## NA's :1445 NA's :3971
## ReceivedDate IsDenied ClaimantAge_at_DOI
## Min. :1971-11-24 00:00:00.00 Length:16857 Min. :-5973.00
## 1st Qu.:1998-09-22 00:00:00.00 Class :character 1st Qu.: 36.00
## Median :2003-11-19 00:00:00.00 Mode :character Median : 45.00
## Mean :2003-10-25 04:58:07.87 Mean : 43.29
## 3rd Qu.:2008-11-12 00:00:00.00 3rd Qu.: 53.00
## Max. :4996-02-17 00:00:00.00 Max. : 87.00
## NA's :5366
## Gender ClaimantType InjuryNature BodyPartRegion
## Length:16857 Length:16857 Length:16857 Length:16857
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BodyPart BillReviewALE Hospital PhysicianOutpatient
## Length:16857 Min. : 0.00 Min. : 0 Min. : -18.8
## Class :character 1st Qu.: 8.25 1st Qu.: 187 1st Qu.: 105.0
## Mode :character Median : 24.00 Median : 536 Median : 216.9
## Mean : 179.20 Mean : 4292 Mean : 1662.5
## 3rd Qu.: 64.00 3rd Qu.: 2075 3rd Qu.: 656.3
## Max. :15836.77 Max. :408665 Max. :318039.6
## NA's :14035 NA's :14200 NA's :10226
## Rx
## Min. : 0.00
## 1st Qu.: 21.70
## Median : 56.38
## Mean : 1326.86
## 3rd Qu.: 177.70
## Max. :187627.17
## NA's :14338
count(df_regr, ClaimantType, sort = TRUE)## # A tibble: 3 × 2
## ClaimantType n
## <chr> <int>
## 1 Indemnity 16821
## 2 Medical Only 26
## 3 Report Only 10
count(df_regr, InjuryNature, sort = TRUE)## # A tibble: 45 × 2
## InjuryNature n
## <chr> <int>
## 1 Strain 5184
## 2 Contusion 2842
## 3 Non-Standard Code 2392
## 4 Sprain 1738
## 5 Fracture 1345
## 6 All Other Specific Injuries, Noc 1162
## 7 Laceration 411
## 8 Multiple Physical Injuries Only 363
## 9 Carpal Tunnel Syndrome 204
## 10 Dislocation 136
## # ℹ 35 more rows
count(df_regr, BodyPartRegion, sort = TRUE)## # A tibble: 7 × 2
## BodyPartRegion n
## <chr> <int>
## 1 Upper Extremities 4398
## 2 Lower Extremities 4057
## 3 Trunk 3077
## 4 Non-Standard Code 2317
## 5 Multiple Body Parts 1889
## 6 Head 630
## 7 Neck 489
count(df_regr, IsDenied, sort = TRUE)## # A tibble: 2 × 2
## IsDenied n
## <chr> <int>
## 1 0 16399
## 2 1 458
count(df_regr, Gender, sort = TRUE)## # A tibble: 3 × 2
## Gender n
## <chr> <int>
## 1 Male 8023
## 2 Female 7037
## 3 Not Available 1797
count(df_regr, ClaimStatus, sort = TRUE)## # A tibble: 3 × 2
## ClaimStatus n
## <chr> <int>
## 1 C 15412
## 2 O 1079
## 3 R 366
#PARA EL TIEMPO DE PROCESAMIENTO
#Utilizando las Variables de las Fechas.
df_regr$processing_time <- as.numeric(df_regr$ClaimantClosedDate - df_regr$ClaimantOpenedDate, units = "days")
head(df_regr$processing_time)## [1] 383 673 235 2080 2087 2347
#PARA EL COSTO TOTAL INCURRIDO
regresion_cti <- lm(TotalPaid ~ Gender + processing_time + AverageWeeklyWage + BodyPartRegion, data = df_regr )
summary (regresion_cti)##
## Call:
## lm(formula = TotalPaid ~ Gender + processing_time + AverageWeeklyWage +
## BodyPartRegion, data = df_regr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -81253 -16178 -9085 5727 140331
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2810.2072 1332.4317 2.109 0.034954 *
## GenderMale 1964.1063 466.2457 4.213 2.54e-05 ***
## GenderNot Available 2960.8020 1270.2260 2.331 0.019770 *
## processing_time 3.4264 0.2051 16.706 < 2e-16 ***
## AverageWeeklyWage 15.2510 0.9181 16.612 < 2e-16 ***
## BodyPartRegionLower Extremities 4221.1216 1230.4767 3.430 0.000604 ***
## BodyPartRegionMultiple Body Parts 8048.6200 1313.0744 6.130 9.02e-10 ***
## BodyPartRegionNeck 6304.1084 1833.5618 3.438 0.000587 ***
## BodyPartRegionNon-Standard Code -9219.4129 1585.7176 -5.814 6.22e-09 ***
## BodyPartRegionTrunk 3497.3291 1254.7274 2.787 0.005321 **
## BodyPartRegionUpper Extremities 4038.4562 1223.0723 3.302 0.000963 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26740 on 15401 degrees of freedom
## (1445 observations deleted due to missingness)
## Multiple R-squared: 0.04246, Adjusted R-squared: 0.04184
## F-statistic: 68.3 on 10 and 15401 DF, p-value: < 2.2e-16
#PARA EL TIEMPO DE PROCESAMIENTO
regresion_pt <- lm(processing_time ~ Gender + TotalPaid + AverageWeeklyWage + BodyPartRegion + ClaimantType, data = df_regr )
summary (regresion_pt)##
## Call:
## lm(formula = processing_time ~ Gender + TotalPaid + AverageWeeklyWage +
## BodyPartRegion + ClaimantType, data = df_regr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2925.9 -804.7 -188.7 752.1 7257.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.766e+03 4.989e+01 35.391 < 2e-16 ***
## GenderMale 8.950e+01 1.815e+01 4.932 8.23e-07 ***
## GenderNot Available 5.292e+02 4.927e+01 10.741 < 2e-16 ***
## TotalPaid 5.189e-03 3.109e-04 16.688 < 2e-16 ***
## AverageWeeklyWage -1.040e+00 3.507e-02 -29.657 < 2e-16 ***
## BodyPartRegionLower Extremities -3.004e+01 4.792e+01 -0.627 0.5307
## BodyPartRegionMultiple Body Parts 4.066e+02 5.107e+01 7.962 1.82e-15 ***
## BodyPartRegionNeck -1.104e+02 7.140e+01 -1.547 0.1219
## BodyPartRegionNon-Standard Code 1.353e+03 6.084e+01 22.244 < 2e-16 ***
## BodyPartRegionTrunk 2.646e+01 4.885e+01 0.542 0.5881
## BodyPartRegionUpper Extremities -6.006e+01 4.763e+01 -1.261 0.2073
## ClaimantTypeMedical Only -5.019e+02 2.045e+02 -2.455 0.0141 *
## ClaimantTypeReport Only 7.892e+02 3.299e+02 2.392 0.0168 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1041 on 15399 degrees of freedom
## (1445 observations deleted due to missingness)
## Multiple R-squared: 0.2956, Adjusted R-squared: 0.295
## F-statistic: 538.4 on 12 and 15399 DF, p-value: < 2.2e-16
#PARA EL COSTO TOTAL INCURRIDO
df_cti <- data.frame(Gender="Male", processing_time=2080, AverageWeeklyWage= 1126.85, BodyPartRegion="Upper Extremities")
predict(regresion_cti,df_cti)## 1
## 33125.36
#PARA EL TIEMPO DE PROCESAMIENTO
df_pt <- data.frame(Gender="Male", TotalPaid=375.61, AverageWeeklyWage=320, BodyPartRegion="Trunk", ClaimantType="Indemnity")
predict(regresion_pt,df_pt)## 1
## 1550.67
Modelo 1: Resultado = 33125.36 / Real= 3873.13 Modelo 2: Resultado = 1550.67 / Real= 2087
# Extraer Variables de Interés
df_arbol1 <- df_regr[,c("TotalPaid", "processing_time", "Gender", "ClaimantType")]
df_arbol1 <- na.omit(df_arbol1)
# ¿Cúantos NA tengo en la base de datos?
sapply(df_arbol1, function(x) sum(is.na(x)))## TotalPaid processing_time Gender ClaimantType
## 0 0 0 0
df_arbol2 <- df_regr[,c("Gender", "ClaimantType", "ClaimStatus", "BodyPartRegion")]
df_arbol2 <- na.omit(df_arbol2)
# ¿Cúantos NA tengo en la base de datos?
sapply(df_arbol2, function(x) sum(is.na(x)))## Gender ClaimantType ClaimStatus BodyPartRegion
## 0 0 0 0
#ÁRBOL 1
ad1 <- rpart(formula = Gender ~ ., data = df_arbol1)
ad1## n= 15412
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 15412 8115 Male (0.41084869 0.47346224 0.11568907)
## 2) processing_time< 3634.5 14239 7223 Male (0.43781164 0.49273123 0.06945712)
## 4) processing_time< 1004.5 6209 3124 Male (0.48397488 0.49685940 0.01916573)
## 8) processing_time< 218.5 1321 546 Female (0.58667676 0.40651022 0.00681302) *
## 9) processing_time>=218.5 4888 2340 Male (0.45621931 0.52127660 0.02250409) *
## 5) processing_time>=1004.5 8030 4099 Male (0.40211706 0.48953923 0.10834371) *
## 3) processing_time>=3634.5 1173 379 Not Available (0.08354646 0.23955669 0.67689685) *
rpart.plot(ad1)prp(ad1)#ÁRBOL 2
ad2 <- rpart(formula = Gender ~ ., data = df_arbol2)
ad2## n= 16857
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 16857 8834 Male (0.417452690 0.475944711 0.106602598)
## 2) BodyPartRegion=Head,Lower Extremities,Multiple Body Parts,Neck,Trunk,Upper Extremities 14540 7040 Male (0.479229711 0.515818432 0.004951857)
## 4) BodyPartRegion=Multiple Body Parts 1889 852 Female (0.548967708 0.449973531 0.001058761) *
## 5) BodyPartRegion=Head,Lower Extremities,Neck,Trunk,Upper Extremities 12651 6001 Male (0.468816694 0.525650146 0.005533159) *
## 3) BodyPartRegion=Non-Standard Code 2317 592 Not Available (0.029779888 0.225722918 0.744497195) *
rpart.plot(ad2)prp(ad2)df_clus <- df_regr
sapply(df_clus, function(x) sum(is.na(x)))## ClaimID TotalPaid TotalReserves
## 0 0 0
## TotalRecovery IndemnityPaid OtherPaid
## 0 0 0
## ClaimStatus IncidentDate ReturnToWorkDate
## 0 0 5605
## AverageWeeklyWage ClaimantOpenedDate ClaimantClosedDate
## 0 0 1445
## EmployerNotificationDate ReceivedDate IsDenied
## 3971 0 0
## ClaimantAge_at_DOI Gender ClaimantType
## 5366 0 0
## InjuryNature BodyPartRegion BodyPart
## 0 0 0
## BillReviewALE Hospital PhysicianOutpatient
## 14035 14200 10226
## Rx processing_time
## 14338 1445
summary(df_clus)## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : 26.88 Min. : 0 Min. : 0.0
## 1st Qu.: 814288 1st Qu.: 3330.54 1st Qu.: 0 1st Qu.: 0.0
## Median : 850173 Median : 11847.95 Median : 0 Median : 0.0
## Mean :10434983 Mean : 23907.09 Mean : 4933 Mean : 307.4
## 3rd Qu.:22717577 3rd Qu.: 31129.08 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :62032209 Max. :149949.17 Max. :659235 Max. :85000.0
##
## IndemnityPaid OtherPaid ClaimStatus
## Min. : 15.5 Min. : -7820 Length:16857
## 1st Qu.: 1079.4 1st Qu.: 1413 Class :character
## Median : 4664.3 Median : 5614 Mode :character
## Mean : 11742.7 Mean : 12164
## 3rd Qu.: 13089.5 3rd Qu.: 16237
## Max. :147420.0 Max. :137142
##
## IncidentDate ReturnToWorkDate
## Min. :1971-11-24 00:00:00.00 Min. :1984-01-01 00:00:00.00
## 1st Qu.:1998-07-18 00:00:00.00 1st Qu.:2000-10-19 18:00:00.00
## Median :2003-10-09 00:00:00.00 Median :2005-06-28 00:00:00.00
## Mean :2003-05-21 23:48:48.55 Mean :2005-03-26 13:16:08.78
## 3rd Qu.:2008-10-20 00:00:00.00 3rd Qu.:2009-11-18 00:00:00.00
## Max. :2014-06-18 00:00:00.00 Max. :2014-06-19 00:00:00.00
## NA's :5605
## AverageWeeklyWage ClaimantOpenedDate
## Min. : 1.0 Min. :1971-11-24 00:00:00.00
## 1st Qu.: 467.0 1st Qu.:1998-09-22 00:00:00.00
## Median : 592.9 Median :2003-11-18 00:00:00.00
## Mean : 592.9 Mean :2003-07-29 11:06:44.26
## 3rd Qu.: 638.4 3rd Qu.:2009-03-11 00:00:00.00
## Max. :15062.3 Max. :2014-06-24 00:00:00.00
##
## ClaimantClosedDate EmployerNotificationDate
## Min. :1999-07-12 00:00:00.00 Min. :1978-04-07 00:00:00.00
## 1st Qu.:2005-03-31 00:00:00.00 1st Qu.:2000-06-21 00:00:00.00
## Median :2006-03-31 00:00:00.00 Median :2005-04-09 00:00:00.00
## Mean :2007-03-24 08:30:53.72 Mean :2005-08-15 23:35:58.44
## 3rd Qu.:2010-03-29 00:00:00.00 3rd Qu.:2009-11-04 18:00:00.00
## Max. :2014-06-30 00:00:00.00 Max. :9999-07-01 00:00:00.00
## NA's :1445 NA's :3971
## ReceivedDate IsDenied ClaimantAge_at_DOI
## Min. :1971-11-24 00:00:00.00 Length:16857 Min. :-5973.00
## 1st Qu.:1998-09-22 00:00:00.00 Class :character 1st Qu.: 36.00
## Median :2003-11-19 00:00:00.00 Mode :character Median : 45.00
## Mean :2003-10-25 04:58:07.87 Mean : 43.29
## 3rd Qu.:2008-11-12 00:00:00.00 3rd Qu.: 53.00
## Max. :4996-02-17 00:00:00.00 Max. : 87.00
## NA's :5366
## Gender ClaimantType InjuryNature BodyPartRegion
## Length:16857 Length:16857 Length:16857 Length:16857
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BodyPart BillReviewALE Hospital PhysicianOutpatient
## Length:16857 Min. : 0.00 Min. : 0 Min. : -18.8
## Class :character 1st Qu.: 8.25 1st Qu.: 187 1st Qu.: 105.0
## Mode :character Median : 24.00 Median : 536 Median : 216.9
## Mean : 179.20 Mean : 4292 Mean : 1662.5
## 3rd Qu.: 64.00 3rd Qu.: 2075 3rd Qu.: 656.3
## Max. :15836.77 Max. :408665 Max. :318039.6
## NA's :14035 NA's :14200 NA's :10226
## Rx processing_time
## Min. : 0.00 Min. : 7
## 1st Qu.: 21.70 1st Qu.: 587
## Median : 56.38 Median : 1290
## Mean : 1326.86 Mean : 1598
## 3rd Qu.: 177.70 3rd Qu.: 2382
## Max. :187627.17 Max. :10830
## NA's :14338 NA's :1445
df_clus <- df_clus[, c("TotalPaid", "TotalRecovery", "IndemnityPaid", "OtherPaid", "AverageWeeklyWage", "ClaimantAge_at_DOI", "processing_time")]
summary(df_clus)## TotalPaid TotalRecovery IndemnityPaid OtherPaid
## Min. : 26.88 Min. : 0.0 Min. : 15.5 Min. : -7820
## 1st Qu.: 3330.54 1st Qu.: 0.0 1st Qu.: 1079.4 1st Qu.: 1413
## Median : 11847.95 Median : 0.0 Median : 4664.3 Median : 5614
## Mean : 23907.09 Mean : 307.4 Mean : 11742.7 Mean : 12164
## 3rd Qu.: 31129.08 3rd Qu.: 0.0 3rd Qu.: 13089.5 3rd Qu.: 16237
## Max. :149949.17 Max. :85000.0 Max. :147420.0 Max. :137142
##
## AverageWeeklyWage ClaimantAge_at_DOI processing_time
## Min. : 1.0 Min. :-5973.00 Min. : 7
## 1st Qu.: 467.0 1st Qu.: 36.00 1st Qu.: 587
## Median : 592.9 Median : 45.00 Median : 1290
## Mean : 592.9 Mean : 43.29 Mean : 1598
## 3rd Qu.: 638.4 3rd Qu.: 53.00 3rd Qu.: 2382
## Max. :15062.3 Max. : 87.00 Max. :10830
## NA's :5366 NA's :1445
df_clus <- na.omit(df_clus)
sapply(df_clus, function(x) sum(is.na(x)))## TotalPaid TotalRecovery IndemnityPaid OtherPaid
## 0 0 0 0
## AverageWeeklyWage ClaimantAge_at_DOI processing_time
## 0 0 0
#Borramos dos filas debido al error que apareció en el siguiente chunk
#Error in data.frame(..., check.names = FALSE) :
#arguments imply differing number of rows: 3768, 10046
#df_clus <- df_clus[-3768, ]
#df_clus <- df_clus[-10044, ]#O. normalizar variables
#df_clus <- as.data.frame(scale(df_clus))
# 1. Crear base de datos
df_clus## # A tibble: 10,046 × 7
## TotalPaid TotalRecovery IndemnityPaid OtherPaid AverageWeeklyWage
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 11948. 0 244. 11704. 640.
## 2 43108. 0 40000 3108. 539
## 3 1990. 0 605. 1385. 756.
## 4 5407. 0 2246. 3161. 328.
## 5 32606. 0 20000 12606. 512.
## 6 633. 0 365. 268. 767.
## 7 114251. 0 44271. 69980. 431.
## 8 21020. 0 9344. 11676. 798.
## 9 78853. 0 35239. 43614. 445.
## 10 3001. 0 2020. 981. 421.
## # ℹ 10,036 more rows
## # ℹ 2 more variables: ClaimantAge_at_DOI <dbl>, processing_time <dbl>
# 2. Determinar el número de grupos
grupos <- 8
# 3. Realizar la clasificación
segmentos <- kmeans(df_clus,grupos)
# 4. Revisar la asignación de grupos
asignacion <- cbind(df_clus, cluster=segmentos$cluster)
# 5. Graficar asignaciones
# install.packages("ggplot2")
library(ggplot2)
# install.packages("factoextra")
library(factoextra)
fviz_cluster(segmentos,data=df_clus)# 6. Optimizar la cantidad de grupos
#Se trababa al hacer la optimización pero el código era el siguiente:
#set.seed(123)
#optimizacion <- clusGap(df_clus, FUN = kmeans, nstart = 1, K.max = 5)
#plot(optimizacion, xlab = "Número de clusters K")
# El punto más alto de la gráfica indica la cantidad de grupos óptimo