bd1 <- read.csv("/Users/santiago/Downloads/ClaimsData2018.csv")
bd2 <- read.csv("/Users/santiago/Downloads/TransactionsSummary2018.csv")
bd <- merge(bd1, bd2, by = "ClaimID", all = TRUE)
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 633915 Min. : -270 Min. : 0 Min. : 0.00
## 1st Qu.: 810246 1st Qu.: 60 1st Qu.: 0 1st Qu.: 0.00
## Median : 856915 Median : 235 Median : 0 Median : 0.00
## Mean :12344572 Mean : 6746 Mean : 2233 Mean : 68.88
## 3rd Qu.:22716420 3rd Qu.: 938 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62246496 Max. :4527291 Max. :2069575 Max. :130541.03
## NA's :52673 NA's :52673 NA's :52673
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : -475 Min. : -7820 Length:186677 Length:186677
## 1st Qu.: 0 1st Qu.: 58 Class :character Class :character
## Median : 0 Median : 230 Mode :character Mode :character
## Mean : 3061 Mean : 3685
## 3rd Qu.: 0 3rd Qu.: 855
## Max. :640732 Max. :4129915
## NA's :52673 NA's :52673
## IncidentDescription ReturnToWorkDate AverageWeeklyWage ClaimantOpenedDate
## Length:186677 Length:186677 Length:186677 Length:186677
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## ClaimantClosedDate EmployerNotificationDate ReceivedDate IsDenied
## Length:186677 Length:186677 Length:186677 Min. :0.00
## Class :character Class :character Class :character 1st Qu.:0.00
## Mode :character Mode :character Mode :character Median :0.00
## Mean :0.04
## 3rd Qu.:0.00
## Max. :1.00
## NA's :52673
## ClaimantAge_at_DOI Gender ClaimantType InjuryNature
## Length:186677 Length:186677 Length:186677 Length:186677
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BodyPartRegion BodyPart X X.1
## Length:186677 Length:186677 Mode:logical Mode:logical
## Class :character Class :character NA's:186677 NA's:186677
## Mode :character Mode :character
##
##
##
##
## X.2 X.3 X.4 X.5 X.6
## Mode:logical Mode:logical Mode:logical Mode:logical Mode:logical
## NA's:186677 NA's:186677 NA's:186677 NA's:186677 NA's:186677
##
##
##
##
##
## X.7 X.8 X.9 X.10 X.11
## Mode:logical Mode:logical Mode:logical Mode:logical Mode:logical
## NA's:186677 NA's:186677 NA's:186677 NA's:186677 NA's:186677
##
##
##
##
##
## X.12 X.13 X.14 X.15 X.16
## Mode:logical Mode:logical Mode:logical Mode:logical Mode:logical
## NA's:186677 NA's:186677 NA's:186677 NA's:186677 NA's:186677
##
##
##
##
##
## X.17 X.18 X.19 X.20 X.21
## Mode:logical Mode:logical Mode:logical Mode:logical Mode:logical
## NA's:186677 NA's:186677 NA's:186677 NA's:186677 NA's:186677
##
##
##
##
##
## X.22 BillReviewALE Hospital PhysicianOutpatient
## Mode:logical Min. : -456.0 Min. : -12570.4 Min. : -4655.7
## NA's:186677 1st Qu.: 16.0 1st Qu.: 193.9 1st Qu.: 107.6
## Median : 32.0 Median : 559.1 Median : 221.6
## Mean : 191.2 Mean : 4394.7 Mean : 1752.3
## 3rd Qu.: 80.0 3rd Qu.: 2253.4 3rd Qu.: 710.5
## Max. :56475.3 Max. :2759604.0 Max. :1481468.5
## NA's :139865 NA's :145262 NA's :84986
## Rx
## Min. : -469.5
## 1st Qu.: 23.3
## Median : 58.3
## Mean : 1140.4
## 3rd Qu.: 174.5
## Max. :631635.5
## NA's :145752
## 'data.frame': 186677 obs. of 49 variables:
## $ ClaimID : int 633915 633917 633918 633919 633920 633929 650915 650916 650917 650918 ...
## $ TotalPaid : num NA NA NA NA NA ...
## $ TotalReserves : num NA NA NA NA NA NA 0 0 0 0 ...
## $ TotalRecovery : num NA NA NA NA NA NA 0 0 0 0 ...
## $ IndemnityPaid : num NA NA NA NA NA ...
## $ OtherPaid : num NA NA NA NA NA ...
## $ ClaimStatus : chr NA NA NA NA ...
## $ IncidentDate : chr NA NA NA NA ...
## $ IncidentDescription : chr NA NA NA NA ...
## $ ReturnToWorkDate : chr NA NA NA NA ...
## $ AverageWeeklyWage : chr NA NA NA NA ...
## $ ClaimantOpenedDate : chr NA NA NA NA ...
## $ ClaimantClosedDate : chr NA NA NA NA ...
## $ EmployerNotificationDate: chr NA NA NA NA ...
## $ ReceivedDate : chr NA NA NA NA ...
## $ IsDenied : int NA NA NA NA NA NA 0 0 0 0 ...
## $ ClaimantAge_at_DOI : chr NA NA NA NA ...
## $ Gender : chr NA NA NA NA ...
## $ ClaimantType : chr NA NA NA NA ...
## $ InjuryNature : chr NA NA NA NA ...
## $ BodyPartRegion : chr NA NA NA NA ...
## $ BodyPart : chr NA NA NA NA ...
## $ X : logi NA NA NA NA NA NA ...
## $ X.1 : logi NA NA NA NA NA NA ...
## $ X.2 : logi NA NA NA NA NA NA ...
## $ X.3 : logi NA NA NA NA NA NA ...
## $ X.4 : logi NA NA NA NA NA NA ...
## $ X.5 : logi NA NA NA NA NA NA ...
## $ X.6 : logi NA NA NA NA NA NA ...
## $ X.7 : logi NA NA NA NA NA NA ...
## $ X.8 : logi NA NA NA NA NA NA ...
## $ X.9 : logi NA NA NA NA NA NA ...
## $ X.10 : logi NA NA NA NA NA NA ...
## $ X.11 : logi NA NA NA NA NA NA ...
## $ X.12 : logi NA NA NA NA NA NA ...
## $ X.13 : logi NA NA NA NA NA NA ...
## $ X.14 : logi NA NA NA NA NA NA ...
## $ X.15 : logi NA NA NA NA NA NA ...
## $ X.16 : logi NA NA NA NA NA NA ...
## $ X.17 : logi NA NA NA NA NA NA ...
## $ X.18 : logi NA NA NA NA NA NA ...
## $ X.19 : logi NA NA NA NA NA NA ...
## $ X.20 : logi NA NA NA NA NA NA ...
## $ X.21 : logi NA NA NA NA NA NA ...
## $ X.22 : logi NA NA NA NA NA NA ...
## $ BillReviewALE : num 437.6 311.3 74.7 231.3 24.8 ...
## $ Hospital : num 2035 6540 721 107 NA ...
## $ PhysicianOutpatient : num 4742 2247 231 1078 251 ...
## $ Rx : num 65 0 NA 1629.2 56.5 ...
#count(bd, TotalPaid, sort=TRUE)
#count(bd, ClaimStatus, sort=TRUE)
#count(bd, TotalReserves, sort=TRUE)
#count(bd, TotalRecovery, sort=TRUE)
#count(bd, IndemnityPaid, sort=TRUE)
#count(bd, IncidentDate, sort=TRUE)
#count(bd, OtherPaid, sort=TRUE)
#count(bd, IncidentDescription, sort=TRUE)
#count(bd, AverageWeeklyWage, sort=TRUE)
#count(bd, IsDenied, sort=TRUE)
#count(bd, ClaimantAge_at_DOI, sort=TRUE)
#count(bd, Gender, sort=TRUE)
#count(bd, ClaimantType, sort=TRUE)
#count(bd, InjuryNature, sort=TRUE)
#count(bd, BodyPartRegion, sort=TRUE)
#count(bd, BodyPart, sort=TRUE)
# Convertir a fecha.
bd$ClaimantOpenedDate <- as.Date(bd$ClaimantOpenedDate, format = "%d/%m/%y")
bd$ClaimantClosedDate <- as.Date(bd$ClaimantClosedDate, format = "%d/%m/%y")
# Convertir a número.
bd$ClaimantAge_at_DOI <- as.numeric(bd$ClaimantAge_at_DOI)## Warning: NAs introduced by coercion
bd$ClaimantAge_at_DOI[is.na(bd$ClaimantAge_at_DOI)] <- 0
#Crear columnas de Costo Total Incurrido y Tiempo de Procesamiento.
bd$Total_Inc_cost <- bd$TotalReserves + bd$TotalPaid - bd$TotalRecovery
bd$TP <- as.numeric (difftime(bd$ClaimantClosedDate, bd$ClaimantOpenedDate, units= "days"))
#Eliminar columnas irrelevantes.
bd <- subset(bd, select = -c(X:X.22))
#Filtrar solo perosonas mayores a 50 años.
bdm <- subset(bd, bd$ClaimantAge_at_DOI > 50)
summary(bdm)## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650918 Min. : 0.0 Min. : 0 Min. : 0.00
## 1st Qu.: 817778 1st Qu.: 91.6 1st Qu.: 0 1st Qu.: 0.00
## Median : 5044138 Median : 330.0 Median : 0 Median : 0.00
## Mean :14020963 Mean : 10516.2 Mean : 4688 Mean : 72.72
## 3rd Qu.:22732024 3rd Qu.: 1755.2 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62203889 Max. :1393926.1 Max. :1529053 Max. :130541.03
##
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : 0 Min. : 0.0 Length:23872 Length:23872
## 1st Qu.: 0 1st Qu.: 88.3 Class :character Class :character
## Median : 0 Median : 321.7 Mode :character Mode :character
## Mean : 5041 Mean : 5475.2
## 3rd Qu.: 0 3rd Qu.: 1518.5
## Max. :450680 Max. :1244336.7
##
## IncidentDescription ReturnToWorkDate AverageWeeklyWage ClaimantOpenedDate
## Length:23872 Length:23872 Length:23872 Min. :1977-08-08
## Class :character Class :character Class :character 1st Qu.:2005-02-24
## Mode :character Mode :character Mode :character Median :2008-09-03
## Mean :2008-02-21
## 3rd Qu.:2011-09-22
## Max. :2014-06-30
##
## ClaimantClosedDate EmployerNotificationDate ReceivedDate
## Min. :1999-07-12 Length:23872 Length:23872
## 1st Qu.:2006-09-25 Class :character Class :character
## Median :2009-03-23 Mode :character Mode :character
## Mean :2009-05-18
## 3rd Qu.:2011-12-10
## Max. :2014-06-30
## NA's :1774
## IsDenied ClaimantAge_at_DOI Gender ClaimantType
## Min. :0.00000 Min. :51.00 Length:23872 Length:23872
## 1st Qu.:0.00000 1st Qu.:53.00 Class :character Class :character
## Median :0.00000 Median :56.00 Mode :character Mode :character
## Mean :0.06229 Mean :56.61
## 3rd Qu.:0.00000 3rd Qu.:59.00
## Max. :1.00000 Max. :94.00
##
## InjuryNature BodyPartRegion BodyPart BillReviewALE
## Length:23872 Length:23872 Length:23872 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 16.00
## Mode :character Mode :character Mode :character Median : 24.00
## Mean : 204.10
## 3rd Qu.: 64.25
## Max. :32788.96
## NA's :19465
## Hospital PhysicianOutpatient Rx Total_Inc_cost
## Min. : -3587.1 Min. : 0.0 Min. : -38.90 Min. : -2802.8
## 1st Qu.: 209.8 1st Qu.: 105.8 1st Qu.: 21.20 1st Qu.: 93.3
## Median : 605.0 Median : 219.7 Median : 57.09 Median : 340.8
## Mean : 4721.2 Mean : 1706.1 Mean : 1576.65 Mean : 15131.5
## 3rd Qu.: 2180.4 3rd Qu.: 675.5 3rd Qu.: 195.84 3rd Qu.: 1897.8
## Max. :444256.6 Max. :199700.6 Max. :246614.32 Max. :2912173.6
## NA's :20585 NA's :15968 NA's :20990
## TP
## Min. : 0.0
## 1st Qu.: 92.0
## Median : 221.0
## Mean : 565.1
## 3rd Qu.: 844.0
## Max. :12346.0
## NA's :1774
## 'data.frame': 23872 obs. of 28 variables:
## $ ClaimID : int 650918 777666 777694 777704 777708 777709 777740 777768 777820 777875 ...
## $ TotalPaid : num 1026 235 32606 590 294 ...
## $ TotalReserves : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TotalRecovery : num 0 0 0 0 0 0 0 0 0 0 ...
## $ IndemnityPaid : num 0 0 20000 0 0 ...
## $ OtherPaid : num 1026 235 12606 590 294 ...
## $ ClaimStatus : chr "C" "C" "C" "C" ...
## $ IncidentDate : chr "12/06/09" "02/07/99" "06/07/99" "06/07/99" ...
## $ IncidentDescription : chr "Employee was unloading truck using a pallet jack to unload heavy equipment. Heavy load caused him to lose balan"| __truncated__ "EE. States while trying to assist resident to wheelchair-resident slid down to floor. EE felt pain in lt should"| __truncated__ "EE states while stepping out of van his foot slipped causing EE to fall and land on his neck." "EE states while going to her office she stepped ona spot on the floor and fell injuring her lower back" ...
## $ ReturnToWorkDate : chr "12/06/09" "#VALOR!" "#VALOR!" "26/07/99" ...
## $ AverageWeeklyWage : chr "NULL" "NULL" "511.73" "761.24" ...
## $ ClaimantOpenedDate : Date, format: "2009-07-02" "1999-08-20" ...
## $ ClaimantClosedDate : Date, format: "2010-03-29" "2005-03-31" ...
## $ EmployerNotificationDate: chr "23/06/09" "02/07/99" "06/07/99" "06/07/99" ...
## $ ReceivedDate : chr "02/07/09" "19/08/99" "19/07/99" "26/07/99" ...
## $ IsDenied : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ClaimantAge_at_DOI : num 61 54 51 51 57 51 56 56 51 52 ...
## $ Gender : chr "Male" "Female" "Male" "Female" ...
## $ ClaimantType : chr "Medical Only" "Medical Only" "Indemnity" "Indemnity" ...
## $ InjuryNature : chr "Contusion" "Strain" "Sprain" "Contusion" ...
## $ BodyPartRegion : chr "Upper Extremities" "Multiple Body Parts" "Neck" "Trunk" ...
## $ BodyPart : chr "Shoulder(S)" "Multiple Body Parts (Including Body Systems and Body Parts)" "Multiple Neck Injury" "Lower Back Area" ...
## $ BillReviewALE : num NA NA NA NA 3296 ...
## $ Hospital : num NA NA NA 189 NA ...
## $ PhysicianOutpatient : num NA 221.6 84.6 115.7 24323 ...
## $ Rx : num NA NA 51.5 NA 29489.3 ...
## $ Total_Inc_cost : num 1026 235 32606 590 294 ...
## $ TP : num 270 2050 2900 2075 2057 ...
Observaciones.
1. Podemos observar NAs en varias variables.
2. Algunas de las variables de costo requieren limpieza de valores
negativos.
3. Eliminar columnas irrelevantes para simplificar base de datos, por
ejemplo todas las variables que comienzan con X.
4. Algunos valores requieren modificarse de carácter a númerico.
5. Podemos observar datos outliers en varias columnas númericas.
bdm <- bdm
#summary (bdm)
#str(bdm)
#Eliminar NA´s y negativos
sapply(bdm, function(x)sum(is.na(x)))## ClaimID TotalPaid TotalReserves
## 0 0 0
## TotalRecovery IndemnityPaid OtherPaid
## 0 0 0
## ClaimStatus IncidentDate IncidentDescription
## 0 0 0
## ReturnToWorkDate AverageWeeklyWage ClaimantOpenedDate
## 0 0 0
## ClaimantClosedDate EmployerNotificationDate ReceivedDate
## 1774 0 0
## IsDenied ClaimantAge_at_DOI Gender
## 0 0 0
## ClaimantType InjuryNature BodyPartRegion
## 0 0 0
## BodyPart BillReviewALE Hospital
## 0 19465 20585
## PhysicianOutpatient Rx Total_Inc_cost
## 15968 20990 0
## TP
## 1774
bdm <- bdm[!is.na(bdm$TP), ]
bdm <- bdm[bdm$Total_Inc_cost > 0, ]
#Juntar tablas Total_Inc_Cost y TP, y eliminar renglones como ClaimID
cl <- data.frame(ClaimID = bdm$ClaimID, Total_Inc_cost = bdm$Total_Inc_cost, TP = bdm$TP)
rownames(cl) <- cl$ClaimID
cl <- subset(cl, select = -c(ClaimID))
# Eliminar datos fuera de lo normal
# Los rangos fuera de lo normal están fuera de los siguientes límites
# 1. Límite inferior: Q1 - 1.5*IQR
# 2. Límite superior: Q3 + 1.5*IQR
#IQR = rango intercuartil
# Columna Csoto Total Incurrido
IQR_TIC <- IQR(cl$Total_Inc_cost)
IQR_TIC## [1] 1738.305
## Total_Inc_cost TP
## Min. : 1.4 Min. : 2.0
## 1st Qu.: 188.4 1st Qu.: 115.0
## Median : 452.4 Median : 280.0
## Mean : 6695.4 Mean : 606.8
## 3rd Qu.: 1926.7 3rd Qu.: 895.0
## Max. :1393926.1 Max. :12346.0
## [1] -2419.057
## [1] 4534.158
cl <- cl[cl$Total_Inc_cost<=4534.16, ]
# Columna Tiempo de procesamiento
IQR_TP <- IQR(cl$TP)
IQR_TP## [1] 654
## [1] -866
## [1] 1876
# 0. Normalizar variables
cl <- as.data.frame(scale(cl))
# 1. Crear base de datos
bdc <- cl
# 2. Determinar el número de grupos
grupos <- 3
# 3. Realizar ka clasificación
segmentos <- kmeans(bdc, grupos)
# 4. Revisar la asignación de grupos
asignacion <- cbind(bdc, cluster= segmentos$cluster)
# 5. Gráficar resultados
library(ggplot2)
library(factoextra)## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 714650)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 714650)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 714650)
bdm <- bdm
#summary(bdm)
#count(bd, TotalPaid, sort=TRUE)
#count(bd, ClaimStatus, sort=TRUE)
#count(bd, TotalReserves, sort=TRUE)
#count(bd, TotalRecovery, sort=TRUE)
#count(bd, IndemnityPaid, sort=TRUE)
#count(bd, OtherPaid, sort=TRUE)
#count(bd, IncidentDescription, sort=TRUE)
#count(bd, IncidentDate, sort=TRUE)
#count(bd, AverageWeeklyWage, sort=TRUE)
#count(bd, IsDenied, sort=TRUE)
#count(bd, ClaimantAge_at_DOI, sort=TRUE)
#count(bd, Gender, sort=TRUE)
#count(bd, ClaimantType, sort=TRUE)
#count(bd, InjuryNature, sort=TRUE)
#count(bd, BodyPartRegion, sort=TRUE)
#count(bd, BodyPart, sort=TRUE)Costo <- bdm[,c("Total_Inc_cost","TP","Gender","ClaimantAge_at_DOI","AverageWeeklyWage")]
sapply(Costo, function(x)sum(is.na(x)))## Total_Inc_cost TP Gender ClaimantAge_at_DOI
## 0 0 0 0
## AverageWeeklyWage
## 0
#Eliminar NAs y Negativps
Costo <- Costo[!is.na(Costo$TP), ]
Costo <- Costo[Costo$Total_Inc_cost > 0, ]
Costo <- Costo[!is.na(Costo$AverageWeeklyWage), ]
#Cambiar a númericos y factores
Costo$AverageWeeklyWage <- as.numeric(Costo$AverageWeeklyWage)## Warning: NAs introduced by coercion
Costo$Total_Inc_cost <- as.numeric(Costo$Total_Inc_cost)
Costo$Gender <- as.factor(Costo$Gender)
summary(Costo)## Total_Inc_cost TP Gender ClaimantAge_at_DOI
## Min. : 1.4 Min. : 2.0 Female :8685 Min. :51.00
## 1st Qu.: 188.4 1st Qu.: 115.0 Male :8999 1st Qu.:53.00
## Median : 452.4 Median : 280.0 Not Available: 87 Median :56.00
## Mean : 6695.4 Mean : 606.8 Mean :56.55
## 3rd Qu.: 1926.7 3rd Qu.: 895.0 3rd Qu.:59.00
## Max. :1393926.1 Max. :12346.0 Max. :94.00
##
## AverageWeeklyWage
## Min. : 0.0
## 1st Qu.: 361.8
## Median : 557.2
## Mean : 778.2
## 3rd Qu.: 766.2
## Max. :1377252.0
## NA's :9043
library(rpart)
library(rpart.plot)
arbol1 <- rpart(formula = Total_Inc_cost ~ ., data = Costo)
arbol1## n= 17771
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 17771 1.463323e+13 6695.415
## 2) TP< 470.5 10744 2.847575e+11 1714.490 *
## 3) TP>=470.5 7027 1.367436e+13 14311.050
## 6) AverageWeeklyWage< 341.625 520 1.334002e+11 6668.768 *
## 7) AverageWeeklyWage>=341.625 6507 1.350817e+13 14921.770
## 14) TP< 4659.5 6496 1.307050e+13 14679.880
## 28) AverageWeeklyWage< 600.19 2819 4.452597e+12 14343.640 *
## 29) AverageWeeklyWage>=600.19 3677 8.617344e+12 14937.660
## 58) TP< 1285.5 3364 2.846789e+12 10241.660 *
## 59) TP>=1285.5 313 4.899067e+12 65408.420 *
## 15) TP>=4659.5 11 2.128172e+11 157771.000 *
## n= 17771
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 17771 9232038000 606.8171
## 2) Total_Inc_cost< 6205.98 15246 6964714000 533.5923 *
## 3) Total_Inc_cost>=6205.98 2525 1691987000 1048.9500
## 6) Total_Inc_cost< 105578.2 2302 1211331000 979.1538 *
## 7) Total_Inc_cost>=105578.2 223 353679600 1769.4440 *
#Eliminar datos atípicos y NAs
bdr <- bdr[bdr$Total_Inc_cost > 0 & bdr$Total_Inc_cost < 500000, ]
bdr <- bdr[!is.na(bdr$TP), ]
bdr$AverageWeeklyWage <- as.numeric(bdr$AverageWeeklyWage)## Warning: NAs introduced by coercion
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 777694 Min. : 3.0 Min. :0 Min. : 0.00
## 1st Qu.: 818222 1st Qu.: 259.6 1st Qu.:0 1st Qu.: 0.00
## Median : 834128 Median : 1046.2 Median :0 Median : 0.00
## Mean : 9809087 Mean : 10677.7 Mean :0 Mean : 82.56
## 3rd Qu.:17141767 3rd Qu.: 5747.6 3rd Qu.:0 3rd Qu.: 0.00
## Max. :61546646 Max. :392024.1 Max. :0 Max. :65193.32
##
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : 0 Min. : 0.0 Length:8721 Length:8721
## 1st Qu.: 0 1st Qu.: 248.3 Class :character Class :character
## Median : 0 Median : 880.8 Mode :character Mode :character
## Mean : 5320 Mean : 5358.1
## 3rd Qu.: 923 3rd Qu.: 3696.6
## Max. :371920 Max. :282370.3
##
## IncidentDescription ReturnToWorkDate AverageWeeklyWage
## Length:8721 Length:8721 Min. : 0.0
## Class :character Class :character 1st Qu.: 361.0
## Mode :character Mode :character Median : 557.0
## Mean : 778.2
## 3rd Qu.: 766.2
## Max. :1377252.0
##
## ClaimantOpenedDate ClaimantClosedDate EmployerNotificationDate
## Min. :1994-12-27 Min. :2005-03-31 Length:8721
## 1st Qu.:2006-03-21 1st Qu.:2008-02-15 Class :character
## Median :2009-02-26 Median :2009-12-11 Mode :character
## Mean :2008-08-06 Mean :2009-12-28
## 3rd Qu.:2011-05-25 3rd Qu.:2012-04-23
## Max. :2014-06-06 Max. :2014-06-30
##
## ReceivedDate IsDenied ClaimantAge_at_DOI Gender
## Length:8721 Min. :0.00000 Min. :51.00 Length:8721
## Class :character 1st Qu.:0.00000 1st Qu.:53.00 Class :character
## Mode :character Median :0.00000 Median :56.00 Mode :character
## Mean :0.08439 Mean :56.74
## 3rd Qu.:0.00000 3rd Qu.:59.00
## Max. :1.00000 Max. :91.00
##
## ClaimantType InjuryNature BodyPartRegion BodyPart
## Length:8721 Length:8721 Length:8721 Length:8721
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BillReviewALE Hospital PhysicianOutpatient Rx
## Min. : 0.00 Min. : -3587.1 Min. : 0.0 Min. : 0.00
## 1st Qu.: 16.00 1st Qu.: 189.1 1st Qu.: 103.1 1st Qu.: 20.77
## Median : 24.57 Median : 532.3 Median : 210.4 Median : 52.66
## Mean : 213.92 Mean : 4455.6 Mean : 1571.5 Mean : 1288.60
## 3rd Qu.: 65.40 3rd Qu.: 1980.5 3rd Qu.: 614.7 3rd Qu.: 171.72
## Max. :32788.96 Max. :444256.6 Max. :173447.4 Max. :207509.43
## NA's :7086 NA's :7391 NA's :5494 NA's :7565
## Total_Inc_cost TP
## Min. : 3.0 Min. : 3.0
## 1st Qu.: 253.3 1st Qu.: 111.0
## Median : 1016.3 Median : 234.0
## Mean : 10595.2 Mean : 508.8
## 3rd Qu.: 5701.9 3rd Qu.: 642.0
## Max. :392024.1 Max. :6758.0
##
regresion <- lm(Total_Inc_cost ~ AverageWeeklyWage + TP + BodyPartRegion + ClaimantType + Gender, data = bdr )
summary(regresion)##
## Call:
## lm(formula = Total_Inc_cost ~ AverageWeeklyWage + TP + BodyPartRegion +
## ClaimantType + Gender, data = bdr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98566 -10539 257 2759 352491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.053e+04 1.213e+03 8.681 < 2e-16 ***
## AverageWeeklyWage 5.480e-03 2.016e-02 0.272 0.785771
## TP 1.431e+01 4.982e-01 28.719 < 2e-16 ***
## BodyPartRegionLower Extremities -3.717e+02 1.194e+03 -0.311 0.755515
## BodyPartRegionMultiple Body Parts -1.287e+03 1.377e+03 -0.935 0.349983
## BodyPartRegionNeck -1.060e+03 1.759e+03 -0.602 0.546871
## BodyPartRegionNon-Standard Code 1.942e+03 4.391e+03 0.442 0.658330
## BodyPartRegionTrunk -1.452e+03 1.277e+03 -1.137 0.255453
## BodyPartRegionUpper Extremities -7.707e+02 1.177e+03 -0.655 0.512504
## ClaimantTypeMedical Only -1.395e+04 6.500e+02 -21.456 < 2e-16 ***
## ClaimantTypeReport Only -1.480e+04 1.978e+04 -0.748 0.454334
## GenderMale 2.139e+03 6.085e+02 3.515 0.000442 ***
## GenderNot Available 8.816e+02 4.500e+03 0.196 0.844676
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27960 on 8708 degrees of freedom
## Multiple R-squared: 0.1922, Adjusted R-squared: 0.1911
## F-statistic: 172.6 on 12 and 8708 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = Total_Inc_cost ~ TP + ClaimantType + Gender, data = bdr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98696 -10518 414 2808 352957
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.798e+03 6.358e+02 15.410 < 2e-16 ***
## TP 1.424e+01 4.942e-01 28.812 < 2e-16 ***
## ClaimantTypeMedical Only -1.392e+04 6.482e+02 -21.470 < 2e-16 ***
## ClaimantTypeReport Only -1.506e+04 1.977e+04 -0.762 0.446356
## GenderMale 2.101e+03 6.031e+02 3.484 0.000496 ***
## GenderNot Available 8.289e+02 4.497e+03 0.184 0.853778
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27950 on 8715 degrees of freedom
## Multiple R-squared: 0.1919, Adjusted R-squared: 0.1915
## F-statistic: 414 on 5 and 8715 DF, p-value: < 2.2e-16
## 1
## 6028.54
regresion2 <- lm(TP ~ AverageWeeklyWage + Total_Inc_cost + BodyPartRegion + ClaimantType + Gender, data = bdr )
summary(regresion2)##
## Call:
## lm(formula = TP ~ AverageWeeklyWage + Total_Inc_cost + BodyPartRegion +
## ClaimantType + Gender, data = bdr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2193.5 -278.6 -134.1 55.1 5491.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.802e+02 2.426e+01 23.917 < 2e-16 ***
## AverageWeeklyWage -2.946e-04 4.145e-04 -0.711 0.47726
## Total_Inc_cost 6.047e-03 2.106e-04 28.719 < 2e-16 ***
## BodyPartRegionLower Extremities 9.754e+00 2.454e+01 0.397 0.69109
## BodyPartRegionMultiple Body Parts 2.131e+02 2.822e+01 7.549 4.82e-14 ***
## BodyPartRegionNeck -3.736e+01 3.616e+01 -1.033 0.30151
## BodyPartRegionNon-Standard Code -1.421e+02 9.026e+01 -1.575 0.11534
## BodyPartRegionTrunk 9.514e+01 2.624e+01 3.626 0.00029 ***
## BodyPartRegionUpper Extremities 1.420e+01 2.419e+01 0.587 0.55724
## ClaimantTypeMedical Only -3.601e+02 1.316e+01 -27.365 < 2e-16 ***
## ClaimantTypeReport Only -4.027e+02 4.067e+02 -0.990 0.32210
## GenderMale 2.275e+01 1.252e+01 1.817 0.06921 .
## GenderNot Available 1.099e+02 9.251e+01 1.188 0.23496
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 574.8 on 8708 degrees of freedom
## Multiple R-squared: 0.2293, Adjusted R-squared: 0.2282
## F-statistic: 215.9 on 12 and 8708 DF, p-value: < 2.2e-16
regresion2 <- lm(TP ~ Total_Inc_cost + BodyPartRegion + ClaimantType + Gender, data = bdr )
summary(regresion2)##
## Call:
## lm(formula = TP ~ Total_Inc_cost + BodyPartRegion + ClaimantType +
## Gender, data = bdr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2193.1 -278.8 -134.0 54.9 5492.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.802e+02 2.426e+01 23.916 < 2e-16 ***
## Total_Inc_cost 6.047e-03 2.106e-04 28.719 < 2e-16 ***
## BodyPartRegionLower Extremities 9.689e+00 2.454e+01 0.395 0.69302
## BodyPartRegionMultiple Body Parts 2.126e+02 2.821e+01 7.535 5.35e-14 ***
## BodyPartRegionNeck -3.751e+01 3.616e+01 -1.038 0.29951
## BodyPartRegionNon-Standard Code -1.422e+02 9.026e+01 -1.575 0.11527
## BodyPartRegionTrunk 9.514e+01 2.624e+01 3.626 0.00029 ***
## BodyPartRegionUpper Extremities 1.417e+01 2.419e+01 0.586 0.55806
## ClaimantTypeMedical Only -3.602e+02 1.316e+01 -27.375 < 2e-16 ***
## ClaimantTypeReport Only -4.025e+02 4.067e+02 -0.990 0.32227
## GenderMale 2.262e+01 1.252e+01 1.807 0.07080 .
## GenderNot Available 1.099e+02 9.251e+01 1.188 0.23475
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 574.8 on 8709 degrees of freedom
## Multiple R-squared: 0.2293, Adjusted R-squared: 0.2283
## F-statistic: 235.5 on 11 and 8709 DF, p-value: < 2.2e-16