library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## 1. Import the dataset
# The CSV location can be overridden through the CLAIMS_CSV_PATH environment
# variable so the script is not tied to one machine; when the variable is
# unset or empty, the original path is used.
# (Interactively, file.choose() can be used to locate the file.)
claims_path <- Sys.getenv("CLAIMS_CSV_PATH", unset = "")
if (!nzchar(claims_path)) {
  claims_path <- "/Users/alejandromontano/Desktop/Claims_limpia.csv"
}
df <- read.csv(claims_path)
## 2. Understand the dataset
summary(df)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : -270 Min. : 0 Min. : 0.00
## 1st Qu.: 811125 1st Qu.: 60 1st Qu.: 0 1st Qu.: 0.00
## Median : 844626 Median : 235 Median : 0 Median : 0.00
## Mean :10149151 Mean : 6746 Mean : 2233 Mean : 68.88
## 3rd Qu.:22716506 3rd Qu.: 938 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62203891 Max. :4527291 Max. :2069575 Max. :130541.03
##
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : -475 Min. : -7820 Length:134004 Length:134004
## 1st Qu.: 0 1st Qu.: 58 Class :character Class :character
## Median : 0 Median : 230 Mode :character Mode :character
## Mean : 3061 Mean : 3685
## 3rd Qu.: 0 3rd Qu.: 855
## Max. :640732 Max. :4129915
##
## IncidentDescription AverageWeeklyWage ReceivedDate IsDenied
## Length:134004 Length:134004 Length:134004 Min. :0.00000
## Class :character Class :character Class :character 1st Qu.:0.00000
## Mode :character Mode :character Mode :character Median :0.00000
## Mean :0.04474
## 3rd Qu.:0.00000
## Max. :1.00000
##
## Gender ClaimantType InjuryNature BodyPartRegion
## Length:134004 Length:134004 Length:134004 Length:134004
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BodyPart semana ClaimantClosedDate2 ClaimantOpenedDate2
## Length:134004 Min. : 0.00 Min. : 1.00 Min. : 1.00
## Class :character 1st Qu.:13.00 1st Qu.:13.00 1st Qu.:14.00
## Mode :character Median :25.00 Median :14.00 Median :26.00
## Mean :25.62 Mean :21.55 Mean :25.95
## 3rd Qu.:38.00 3rd Qu.:30.00 3rd Qu.:38.00
## Max. :53.00 Max. :53.00 Max. :53.00
## NA's :58637 NA's :4678
## EmployerNotificationDate2 ReturnToWorkDate2 Total_Incurred_Cost_Claim
## Min. : 1.00 Min. : 1.00 Min. : -11775
## 1st Qu.:14.00 1st Qu.:14.00 1st Qu.: 59
## Median :25.00 Median :26.00 Median : 234
## Mean :25.77 Mean :26.02 Mean : 8910
## 3rd Qu.:38.00 3rd Qu.:38.00 3rd Qu.: 965
## Max. :53.00 Max. :53.00 Max. :5054823
## NA's :22288 NA's :58637
# Keep only the rows where both ReturnToWorkDate2 and ClaimantClosedDate2 are
# non-missing, then summarise the reduced data frame.
df <- filter(
  df,
  !is.na(ReturnToWorkDate2),
  !is.na(ClaimantClosedDate2)
)
summary(df)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : -270.3 Min. :0 Min. : 0.00
## 1st Qu.: 820013 1st Qu.: 45.6 1st Qu.:0 1st Qu.: 0.00
## Median : 842520 Median : 252.7 Median :0 Median : 0.00
## Mean : 9072170 Mean : 4232.0 Mean :0 Mean : 55.52
## 3rd Qu.:11042817 3rd Qu.: 1139.4 3rd Qu.:0 3rd Qu.: 0.00
## Max. :62203889 Max. :808619.3 Max. :0 Max. :80049.48
##
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : -475 Min. : -270.3 Length:71934 Length:71934
## 1st Qu.: 0 1st Qu.: 40.0 Class :character Class :character
## Median : 0 Median : 246.2 Mode :character Mode :character
## Mean : 1801 Mean : 2431.2
## 3rd Qu.: 0 3rd Qu.: 1020.8
## Max. :371920 Max. :745818.7
##
## IncidentDescription AverageWeeklyWage ReceivedDate IsDenied
## Length:71934 Length:71934 Length:71934 Min. :0.00000
## Class :character Class :character Class :character 1st Qu.:0.00000
## Mode :character Mode :character Mode :character Median :0.00000
## Mean :0.04654
## 3rd Qu.:0.00000
## Max. :1.00000
##
## Gender ClaimantType InjuryNature BodyPartRegion
## Length:71934 Length:71934 Length:71934 Length:71934
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BodyPart semana ClaimantClosedDate2 ClaimantOpenedDate2
## Length:71934 Min. : 0.00 Min. : 1.00 Min. : 1.00
## Class :character 1st Qu.:14.00 1st Qu.:13.00 1st Qu.:14.00
## Mode :character Median :26.00 Median :15.00 Median :26.00
## Mean :25.78 Mean :22.47 Mean :26.17
## 3rd Qu.:38.00 3rd Qu.:35.00 3rd Qu.:38.00
## Max. :53.00 Max. :53.00 Max. :53.00
##
## EmployerNotificationDate2 ReturnToWorkDate2 Total_Incurred_Cost_Claim
## Min. : 1.00 Min. : 1.00 Min. :-10681.7
## 1st Qu.:14.00 1st Qu.:14.00 1st Qu.: 39.7
## Median :26.00 Median :26.00 Median : 246.8
## Mean :26.09 Mean :26.18 Mean : 4176.5
## 3rd Qu.:38.00 3rd Qu.:38.00 3rd Qu.: 1100.8
## Max. :53.00 Max. :53.00 Max. :806695.1
## NA's :316
# Coerce the three columns below to numeric in a single call. Any value that
# as.numeric() cannot parse becomes NA and triggers a coercion warning.
df <- transform(
  df,
  ClaimantClosedDate2 = as.numeric(ClaimantClosedDate2),
  ReturnToWorkDate2 = as.numeric(ReturnToWorkDate2),
  AverageWeeklyWage = as.numeric(AverageWeeklyWage)
)
## Warning: NAs introduced by coercion
# Full specification: linear model of Total_Incurred_Cost_Claim on the
# return-to-work, wage, opened, closed and denial columns of df.
# With lm()'s default NA handling, rows with a missing value in any model
# variable are dropped from the fit.
regresion <- lm(
  Total_Incurred_Cost_Claim ~
    ReturnToWorkDate2 +
    AverageWeeklyWage +
    ClaimantOpenedDate2 +
    ClaimantClosedDate2 +
    IsDenied,
  data = df
)
summary(regresion)
##
## Call:
## lm(formula = Total_Incurred_Cost_Claim ~ ReturnToWorkDate2 +
## AverageWeeklyWage + ClaimantOpenedDate2 + ClaimantClosedDate2 +
## IsDenied, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19409 -7645 -6707 -2096 738738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.521e+03 3.550e+02 26.818 < 2e-16 ***
## ReturnToWorkDate2 3.045e+00 1.134e+01 0.269 0.7883
## AverageWeeklyWage 8.016e-04 1.014e-02 0.079 0.9370
## ClaimantOpenedDate2 -5.054e+01 1.145e+01 -4.414 1.02e-05 ***
## ClaimantClosedDate2 -1.732e+01 8.783e+00 -1.972 0.0486 *
## IsDenied -6.063e+03 4.871e+02 -12.448 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25020 on 38585 degrees of freedom
## (33343 observations deleted due to missingness)
## Multiple R-squared: 0.004913, Adjusted R-squared: 0.004784
## F-statistic: 38.1 on 5 and 38585 DF, p-value: < 2.2e-16
# Reduced specification: the same response regressed only on the opened,
# closed and denial columns. This reassigns `regresion`, so any model
# previously stored under that name is replaced.
regresion <- lm(
  Total_Incurred_Cost_Claim ~
    ClaimantOpenedDate2 +
    ClaimantClosedDate2 +
    IsDenied,
  data = df
)
summary(regresion)
##
## Call:
## lm(formula = Total_Incurred_Cost_Claim ~ ClaimantOpenedDate2 +
## ClaimantClosedDate2 + IsDenied, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15310 -4311 -3818 -2681 802647
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4875.974 183.772 26.533 < 2e-16 ***
## ClaimantOpenedDate2 -26.518 4.908 -5.403 6.57e-08 ***
## ClaimantClosedDate2 5.417 4.895 1.107 0.268
## IsDenied -2736.445 336.147 -8.141 4.00e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18990 on 71930 degrees of freedom
## Multiple R-squared: 0.001343, Adjusted R-squared: 0.001301
## F-statistic: 32.24 on 3 and 71930 DF, p-value: < 2.2e-16