lm() es la función de R para ajustar modelos lineales. Es el modelo estadístico más básico que existe y el más fácil de interpretar. Para interpretarlo se usa la medida R-cuadrada, que significa qué tan cerca están los datos de la línea de regresión ajustada (va de 0 a 1, donde 1 es que el modelo explica toda la variabilidad).
# Predictive model ----
# Load the claims CSV into a data frame.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
base_de_datos <- read.csv(
  file = "C:\\Users\\lcbor\\Downloads\\seguros.csv"
)
# Get to know the data: per-column summary statistics, stored and printed.
resumen <- summary(object = base_de_datos)
print(resumen)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 777632 Min. : 0 Min. : 0 Min. : 0.00
## 1st Qu.: 800748 1st Qu.: 83 1st Qu.: 0 1st Qu.: 0.00
## Median : 812128 Median : 271 Median : 0 Median : 0.00
## Mean : 1864676 Mean : 10404 Mean : 3368 Mean : 66.05
## 3rd Qu.: 824726 3rd Qu.: 1122 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62203364 Max. :4527291 Max. :1529053 Max. :100000.00
##
## IndemnityPaid OtherPaid TotalIncurredCost ClaimStatus
## Min. : 0 Min. : 0 Min. : -10400 Length:31619
## 1st Qu.: 0 1st Qu.: 80 1st Qu.: 80 Class :character
## Median : 0 Median : 265 Median : 266 Mode :character
## Mean : 4977 Mean : 5427 Mean : 13706
## 3rd Qu.: 0 3rd Qu.: 1023 3rd Qu.: 1098
## Max. :640732 Max. :4129915 Max. :4734750
##
## IncidentDate IncidentDescription ReturnToWorkDate ClaimantOpenedDate
## Length:31619 Length:31619 Length:31619 Length:31619
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## ClaimantClosedDate EmployerNotificationDate ReceivedDate
## Length:31619 Length:31619 Length:31619
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## IsDenied Transaction_Time Procesing_Time ClaimantAge_at_DOI
## Min. :0.00000 Min. : 0 Min. : 0.00 Min. :14.0
## 1st Qu.:0.00000 1st Qu.: 211 1st Qu.: 4.00 1st Qu.:33.0
## Median :0.00000 Median : 780 Median : 10.00 Median :42.0
## Mean :0.04463 Mean : 1004 Mean : 62.99 Mean :41.6
## 3rd Qu.:0.00000 3rd Qu.: 1440 3rd Qu.: 24.00 3rd Qu.:50.0
## Max. :1.00000 Max. :16428 Max. :11558.00 Max. :94.0
## NA's :614
## Gender ClaimantType InjuryNature BodyPartRegion
## Length:31619 Length:31619 Length:31619 Length:31619
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BodyPart AverageWeeklyWage1 ClaimID1 BillReviewALE
## Length:31619 Min. : 100.0 Min. : 777632 Min. : -448.0
## Class :character 1st Qu.: 492.0 1st Qu.: 800748 1st Qu.: 16.0
## Mode :character Median : 492.0 Median : 812128 Median : 24.0
## Mean : 536.5 Mean : 1864676 Mean : 188.7
## 3rd Qu.: 492.0 3rd Qu.: 824726 3rd Qu.: 64.1
## Max. :8613.5 Max. :62203364 Max. :46055.3
## NA's :14912
## Hospital PhysicianOutpatient Rx
## Min. : -12570.4 Min. : -549.5 Min. : -160.7
## 1st Qu.: 210.5 1st Qu.: 105.8 1st Qu.: 22.9
## Median : 613.9 Median : 218.0 Median : 61.5
## Mean : 5113.2 Mean : 1813.2 Mean : 1695.2
## 3rd Qu.: 2349.1 3rd Qu.: 680.6 3rd Qu.: 189.0
## Max. :2759604.0 Max. :1219766.6 Max. :631635.5
## NA's :19655 NA's :2329 NA's :20730
# Show the type and the first values of every column in the data frame.
utils::str(object = base_de_datos)
## 'data.frame': 31619 obs. of 30 variables:
## $ ClaimID : int 777632 777646 777651 777661 777666 777668 777673 777676 777681 777694 ...
## $ TotalPaid : num 20 0 5407 288 235 ...
## $ TotalReserves : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TotalRecovery : num 0 0 0 0 0 0 0 0 0 0 ...
## $ IndemnityPaid : num 0 0 2246 0 0 ...
## $ OtherPaid : num 20 0 3161 288 235 ...
## $ TotalIncurredCost : num 20 0 5407 288 235 ...
## $ ClaimStatus : chr "C" "C" "C" "C" ...
## $ IncidentDate : chr "01-jul-99" "01-jul-99" "02-jul-99" "02-jul-99" ...
## $ IncidentDescription : chr "EE. States while going down the steps--EE foot slipped & she fell down to cement on ground--injurher rt. Knee, "| __truncated__ "Neck, arm, wrist and neck... On going for 20 yrs, repetitive, performing ultrasound exam on patients. Repetitiv"| __truncated__ "EE was removing items from freezer when items started to fall - EE had to get out of way of falling boxes- strained back" "EE states the fumes from a truck caused her to getan upset stomach and headache." ...
## $ ReturnToWorkDate : chr "" "" "" "" ...
## $ ClaimantOpenedDate : chr "12-jul-99" "29-sep-05" "14-jul-99" "20-jul-99" ...
## $ ClaimantClosedDate : chr "31-mar-05" "22DEC2005" "31-mar-05" "17-jun-08" ...
## $ EmployerNotificationDate: chr "01-jul-99" "01-jul-99" "02-jul-99" "02-jul-99" ...
## $ ReceivedDate : chr "12-jul-99" "29-sep-05" "13-jul-99" "20-jul-99" ...
## $ IsDenied : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Transaction_Time : int 2089 84 2087 3255 2050 2195 2493 2058 2086 2900 ...
## $ Procesing_Time : int 11 2282 12 18 49 9 8 39 10 13 ...
## $ ClaimantAge_at_DOI : int 49 48 33 29 54 49 38 48 40 51 ...
## $ Gender : chr "Female" "Female" "Female" "Female" ...
## $ ClaimantType : chr "Medical Only" "Indemnity" "Indemnity" "Medical Only" ...
## $ InjuryNature : chr "Laceration" "All Other Specific Injuries, Noc" "Strain" "Respiratory Disorders" ...
## $ BodyPartRegion : chr "Multiple Body Parts" "Multiple Body Parts" "Trunk" "Multiple Body Parts" ...
## $ BodyPart : chr "Multiple Body Parts (Including Body Systems and Body Parts)" "Multiple Body Parts (Including Body Systems and Body Parts)" "Lower Back Area" "Multiple Body Parts (Including Body Systems and Body Parts)" ...
## $ AverageWeeklyWage1 : num 492 673 328 492 492 ...
## $ ClaimID1 : int 777632 777646 777651 777661 777666 777668 777673 777676 777681 777694 ...
## $ BillReviewALE : num NA 592 NA NA NA ...
## $ Hospital : num NA 36342 299 NA NA ...
## $ PhysicianOutpatient : num NA 41773.4 155 65.5 221.6 ...
## $ Rx : num 50.3 3108.5 NA NA NA ...
# Fit the regression (linear model) of TotalIncurredCost on all candidate
# predictors. Rows with a missing value in any model variable are left out of
# the fit (see the "observations deleted due to missingness" line of the
# summary output).
regresion <- lm(
  formula = TotalIncurredCost ~ ClaimStatus + IsDenied + Procesing_Time +
    ClaimantAge_at_DOI + Gender + ClaimantType + InjuryNature +
    BodyPartRegion + AverageWeeklyWage1 + BillReviewALE + Hospital +
    PhysicianOutpatient + Rx,
  data = base_de_datos
)
# Print coefficients, significance levels and R-squared of the fit.
print(summary(regresion))
##
## Call:
## lm(formula = TotalIncurredCost ~ ClaimStatus + IsDenied + Procesing_Time +
## ClaimantAge_at_DOI + Gender + ClaimantType + InjuryNature +
## BodyPartRegion + AverageWeeklyWage1 + BillReviewALE + Hospital +
## PhysicianOutpatient + Rx, data = base_de_datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -454179 -3153 24 2285 1063665
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 1.701e+04 6.025e+03
## ClaimStatusO 4.490e+05 1.171e+04
## ClaimStatusR 3.190e+05 1.305e+04
## IsDenied -8.474e+03 4.366e+03
## Procesing_Time 4.647e+01 4.818e+00
## ClaimantAge_at_DOI -2.612e+01 6.833e+01
## GenderMale 3.149e+02 1.538e+03
## GenderNot Available 2.031e+03 1.205e+04
## ClaimantTypeMedical Only -2.477e+04 2.017e+03
## ClaimantTypeReport Only -2.431e+04 4.829e+03
## InjuryNatureAsbestosis 6.366e+03 3.063e+04
## InjuryNatureBurn 1.883e+02 7.904e+03
## InjuryNatureCarpal Tunnel Syndrome -3.840e+03 1.039e+04
## InjuryNatureConcussion 2.100e+04 1.679e+04
## InjuryNatureContagious Disease 5.091e+03 1.674e+04
## InjuryNatureContusion 3.604e+03 4.430e+03
## InjuryNatureCrushing 3.074e+03 1.492e+04
## InjuryNatureDermatitis 4.036e+03 7.050e+03
## InjuryNatureDislocation -3.692e+04 1.423e+04
## InjuryNatureDust Disease, NOC 7.068e+02 4.304e+04
## InjuryNatureElectric Shock 5.065e+03 2.512e+04
## InjuryNatureForeign Body 4.459e+03 6.199e+03
## InjuryNatureFracture 4.540e+03 6.567e+03
## InjuryNatureHearing Loss Or Impairment -8.563e+02 2.521e+04
## InjuryNatureHeat Prostration -1.972e+03 2.513e+04
## InjuryNatureHernia -9.356e+03 2.192e+04
## InjuryNatureInfection -2.409e+03 1.674e+04
## InjuryNatureInflammation 4.660e+03 9.310e+03
## InjuryNatureLaceration 3.311e+03 5.143e+03
## InjuryNatureLoss of Hearing -1.943e+04 2.528e+04
## InjuryNatureMental Stress -1.419e+04 1.978e+04
## InjuryNatureMultiple Physical Injuries Only 1.472e+04 7.683e+03
## InjuryNatureMyocardial Infarction -1.661e+04 3.091e+04
## InjuryNatureNo Physical Injury 3.799e+03 8.442e+03
## InjuryNatureNon-Standard Code -1.644e+03 2.187e+04
## InjuryNaturePoisoning?Chemical (Other Than Metals) 4.469e+01 1.979e+04
## InjuryNaturePuncture 4.340e+03 5.646e+03
## InjuryNatureRespiratory Disorders -2.827e+03 8.525e+03
## InjuryNatureRupture -2.115e+04 3.065e+04
## InjuryNatureSeverance 8.548e+03 3.071e+04
## InjuryNatureSprain -5.494e+03 5.390e+03
## InjuryNatureStrain 4.182e+03 4.543e+03
## InjuryNatureSyncope 3.808e+03 1.968e+04
## BodyPartRegionLower Extremities 4.185e+03 3.199e+03
## BodyPartRegionMultiple Body Parts 6.375e+02 3.339e+03
## BodyPartRegionNeck 1.189e+02 5.586e+03
## BodyPartRegionNon-Standard Code -1.555e+03 2.331e+04
## BodyPartRegionTrunk 4.658e+03 3.423e+03
## BodyPartRegionUpper Extremities 9.662e+02 2.994e+03
## AverageWeeklyWage1 4.569e+00 3.241e+00
## BillReviewALE 3.181e-02 6.144e-01
## Hospital -7.270e-04 2.014e-02
## PhysicianOutpatient -1.361e-02 5.445e-02
## Rx 7.366e-03 4.369e-02
## t value Pr(>|t|)
## (Intercept) 2.822 0.00480 **
## ClaimStatusO 38.348 < 2e-16 ***
## ClaimStatusR 24.432 < 2e-16 ***
## IsDenied -1.941 0.05236 .
## Procesing_Time 9.645 < 2e-16 ***
## ClaimantAge_at_DOI -0.382 0.70227
## GenderMale 0.205 0.83777
## GenderNot Available 0.169 0.86613
## ClaimantTypeMedical Only -12.280 < 2e-16 ***
## ClaimantTypeReport Only -5.034 5.07e-07 ***
## InjuryNatureAsbestosis 0.208 0.83536
## InjuryNatureBurn 0.024 0.98100
## InjuryNatureCarpal Tunnel Syndrome -0.369 0.71180
## InjuryNatureConcussion 1.251 0.21111
## InjuryNatureContagious Disease 0.304 0.76103
## InjuryNatureContusion 0.814 0.41599
## InjuryNatureCrushing 0.206 0.83672
## InjuryNatureDermatitis 0.572 0.56707
## InjuryNatureDislocation -2.594 0.00953 **
## InjuryNatureDust Disease, NOC 0.016 0.98690
## InjuryNatureElectric Shock 0.202 0.84025
## InjuryNatureForeign Body 0.719 0.47198
## InjuryNatureFracture 0.691 0.48939
## InjuryNatureHearing Loss Or Impairment -0.034 0.97290
## InjuryNatureHeat Prostration -0.078 0.93744
## InjuryNatureHernia -0.427 0.66957
## InjuryNatureInfection -0.144 0.88562
## InjuryNatureInflammation 0.501 0.61674
## InjuryNatureLaceration 0.644 0.51980
## InjuryNatureLoss of Hearing -0.768 0.44229
## InjuryNatureMental Stress -0.717 0.47332
## InjuryNatureMultiple Physical Injuries Only 1.915 0.05554 .
## InjuryNatureMyocardial Infarction -0.537 0.59102
## InjuryNatureNo Physical Injury 0.450 0.65276
## InjuryNatureNon-Standard Code -0.075 0.94008
## InjuryNaturePoisoning?Chemical (Other Than Metals) 0.002 0.99820
## InjuryNaturePuncture 0.769 0.44208
## InjuryNatureRespiratory Disorders -0.332 0.74022
## InjuryNatureRupture -0.690 0.49038
## InjuryNatureSeverance 0.278 0.78075
## InjuryNatureSprain -1.019 0.30811
## InjuryNatureStrain 0.921 0.35736
## InjuryNatureSyncope 0.193 0.84659
## BodyPartRegionLower Extremities 1.308 0.19081
## BodyPartRegionMultiple Body Parts 0.191 0.84862
## BodyPartRegionNeck 0.021 0.98302
## BodyPartRegionNon-Standard Code -0.067 0.94682
## BodyPartRegionTrunk 1.360 0.17377
## BodyPartRegionUpper Extremities 0.323 0.74697
## AverageWeeklyWage1 1.410 0.15871
## BillReviewALE 0.052 0.95871
## Hospital -0.036 0.97120
## PhysicianOutpatient -0.250 0.80267
## Rx 0.169 0.86612
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42770 on 3191 degrees of freedom
## (28374 observations deleted due to missingness)
## Multiple R-squared: 0.6116, Adjusted R-squared: 0.6051
## F-statistic: 94.8 on 53 and 3191 DF, p-value: < 2.2e-16
# Evaluate the first fit and refit with a reduced set of predictors.
# This overwrites `regresion`; the prediction step later in the script uses
# this reduced model.
# NOTE(review): Gender and Rx are kept although neither is significant in the
# summary output, and the 20730 rows dropped for missingness match the NA
# count of Rx -- consider whether they should stay in the formula.
# NOTE(review): this fit uses a different subset of rows than the first one,
# so the two R-squared values are not directly comparable.
regresion <- lm(
  formula = TotalIncurredCost ~ ClaimStatus + IsDenied + Procesing_Time +
    Gender + ClaimantType + Rx,
  data = base_de_datos
)
# Print coefficients, significance levels and R-squared of the reduced fit.
print(summary(regresion))
##
## Call:
## lm(formula = TotalIncurredCost ~ ClaimStatus + IsDenied + Procesing_Time +
## Gender + ClaimantType + Rx, data = base_de_datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -613611 -1298 332 925 2635418
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.171e+04 1.347e+03 16.118 < 2e-16 ***
## ClaimStatusO 4.357e+05 6.002e+03 72.589 < 2e-16 ***
## ClaimStatusR 2.543e+05 7.006e+03 36.296 < 2e-16 ***
## IsDenied -9.765e+03 2.724e+03 -3.585 0.000339 ***
## Procesing_Time 3.809e+01 2.242e+00 16.991 < 2e-16 ***
## GenderMale 6.080e+02 1.097e+03 0.554 0.579469
## GenderNot Available -6.497e+03 9.556e+03 -0.680 0.496640
## ClaimantTypeMedical Only -2.263e+04 1.366e+03 -16.562 < 2e-16 ***
## ClaimantTypeReport Only -2.315e+04 3.028e+03 -7.648 2.22e-14 ***
## Rx 1.172e-02 4.309e-02 0.272 0.785670
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56980 on 10879 degrees of freedom
## (20730 observations deleted due to missingness)
## Multiple R-squared: 0.525, Adjusted R-squared: 0.5246
## F-statistic: 1336 on 9 and 10879 DF, p-value: < 2.2e-16
# Build a prediction: describe one new claim using the predictors of the
# reduced model, then ask the fitted model for its TotalIncurredCost estimate.
datos_nuevos <- data.frame(
  Procesing_Time = 12,
  ClaimStatus = "C",
  ClaimantType = "Indemnity",
  IsDenied = 0,
  Gender = "Female",
  Rx = 154.97
)
predict(object = regresion, newdata = datos_nuevos)
## 1
## 22172.82
# En conclusión, el modelo puede mejorar: algunas de las variables que hacen mi modelo más significativo tienen muchos registros sin datos (NA) en la base de datos.