Modelo Predictivo

Teoría

lm() es la función de R para ajustar modelos lineales. Es el modelo estadistico mas básico que existe y más fácil de interpretar. Para interpretarlo se usa la medida R-cuadrada, que significa qué tan cerca están los datos de la línea de regresión ajustada( Va de 0 a 1, donde es que el modelo explica toda la variabilidad).

Importar base de datos

#file.choose()
base_de_datos <- read.csv("/Users/sebastianfajardo/Downloads/seguros.csv")

Entender la base de datos

resumen <- summary(base_de_datos)
resumen
##     ClaimID           TotalPaid       TotalReserves     TotalRecovery      
##  Min.   :  777632   Min.   :      0   Min.   :      0   Min.   :     0.00  
##  1st Qu.:  800748   1st Qu.:     83   1st Qu.:      0   1st Qu.:     0.00  
##  Median :  812128   Median :    271   Median :      0   Median :     0.00  
##  Mean   : 1864676   Mean   :  10404   Mean   :   3368   Mean   :    66.05  
##  3rd Qu.:  824726   3rd Qu.:   1122   3rd Qu.:      0   3rd Qu.:     0.00  
##  Max.   :62203364   Max.   :4527291   Max.   :1529053   Max.   :100000.00  
##                                                                            
##  IndemnityPaid      OtherPaid       TotalIncurredCost ClaimStatus       
##  Min.   :     0   Min.   :      0   Min.   : -10400   Length:31619      
##  1st Qu.:     0   1st Qu.:     80   1st Qu.:     80   Class :character  
##  Median :     0   Median :    265   Median :    266   Mode  :character  
##  Mean   :  4977   Mean   :   5427   Mean   :  13706                     
##  3rd Qu.:     0   3rd Qu.:   1023   3rd Qu.:   1098                     
##  Max.   :640732   Max.   :4129915   Max.   :4734750                     
##                                                                         
##  IncidentDate       IncidentDescription ReturnToWorkDate   ClaimantOpenedDate
##  Length:31619       Length:31619        Length:31619       Length:31619      
##  Class :character   Class :character    Class :character   Class :character  
##  Mode  :character   Mode  :character    Mode  :character   Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##  ClaimantClosedDate EmployerNotificationDate ReceivedDate      
##  Length:31619       Length:31619             Length:31619      
##  Class :character   Class :character         Class :character  
##  Mode  :character   Mode  :character         Mode  :character  
##                                                                
##                                                                
##                                                                
##                                                                
##     IsDenied       Transaction_Time Procesing_Time     ClaimantAge_at_DOI
##  Min.   :0.00000   Min.   :    0    Min.   :    0.00   Min.   :14.0      
##  1st Qu.:0.00000   1st Qu.:  211    1st Qu.:    4.00   1st Qu.:33.0      
##  Median :0.00000   Median :  780    Median :   10.00   Median :42.0      
##  Mean   :0.04463   Mean   : 1004    Mean   :   62.99   Mean   :41.6      
##  3rd Qu.:0.00000   3rd Qu.: 1440    3rd Qu.:   24.00   3rd Qu.:50.0      
##  Max.   :1.00000   Max.   :16428    Max.   :11558.00   Max.   :94.0      
##                    NA's   :614                                           
##     Gender          ClaimantType       InjuryNature       BodyPartRegion    
##  Length:31619       Length:31619       Length:31619       Length:31619      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    BodyPart         AverageWeeklyWage1    ClaimID1        BillReviewALE    
##  Length:31619       Min.   : 100.0     Min.   :  777632   Min.   : -448.0  
##  Class :character   1st Qu.: 492.0     1st Qu.:  800748   1st Qu.:   16.0  
##  Mode  :character   Median : 492.0     Median :  812128   Median :   24.0  
##                     Mean   : 536.5     Mean   : 1864676   Mean   :  188.7  
##                     3rd Qu.: 492.0     3rd Qu.:  824726   3rd Qu.:   64.1  
##                     Max.   :8613.5     Max.   :62203364   Max.   :46055.3  
##                                                           NA's   :14912    
##     Hospital         PhysicianOutpatient       Rx          
##  Min.   : -12570.4   Min.   :   -549.5   Min.   :  -160.7  
##  1st Qu.:    210.5   1st Qu.:    105.8   1st Qu.:    22.9  
##  Median :    613.9   Median :    218.0   Median :    61.5  
##  Mean   :   5113.2   Mean   :   1813.2   Mean   :  1695.2  
##  3rd Qu.:   2349.1   3rd Qu.:    680.6   3rd Qu.:   189.0  
##  Max.   :2759604.0   Max.   :1219766.6   Max.   :631635.5  
##  NA's   :19655       NA's   :2329        NA's   :20730
plot(base_de_datos$ClaimantAge_at_DOI, base_de_datos$TotalIncurredCost, 
     main="Influencia de la Edad del Reclamante sobre el Costo Total Incurrido", 
     xlab="Edad del Reclamante (años)", 
     ylab="Costo Total Incurrido ($)")

boxplot(TotalIncurredCost ~ Gender, data = base_de_datos,
        main="Distribución del Costo Total Incurrido por Género",
        xlab="Género",
        ylab="Costo Total Incurrido ($)")

Generar regresion (modelo lineal)

regresion <- lm(TotalIncurredCost ~ TotalPaid + IndemnityPaid + OtherPaid + Hospital + PhysicianOutpatient + Rx + ClaimantAge_at_DOI + Gender, data = base_de_datos)
summary(regresion)
## 
## Call:
## lm(formula = TotalIncurredCost ~ TotalPaid + IndemnityPaid + 
##     OtherPaid + Hospital + PhysicianOutpatient + Rx + ClaimantAge_at_DOI + 
##     Gender, data = base_de_datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -337669     -73     879    1734  723053 
## 
## Coefficients: (1 not defined because of singularities)
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.460e+03  1.662e+03   0.879   0.3797    
## TotalPaid            1.443e+00  1.558e-02  92.634   <2e-16 ***
## IndemnityPaid        7.633e-02  3.313e-02   2.304   0.0213 *  
## OtherPaid                   NA         NA      NA       NA    
## Hospital             1.087e-03  1.040e-02   0.104   0.9168    
## PhysicianOutpatient  1.093e-03  1.972e-02   0.055   0.9558    
## Rx                  -2.531e-02  2.447e-02  -1.034   0.3010    
## ClaimantAge_at_DOI  -4.679e+01  3.688e+01  -1.269   0.2046    
## GenderMale          -1.529e+03  8.350e+02  -1.831   0.0672 .  
## GenderNot Available -1.238e+04  6.482e+03  -1.910   0.0562 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28810 on 4787 degrees of freedom
##   (26823 observations deleted due to missingness)
## Multiple R-squared:  0.8614, Adjusted R-squared:  0.8612 
## F-statistic:  3719 on 8 and 4787 DF,  p-value: < 2.2e-16

Construir un modelo de prediccion

regresion <- lm(TotalIncurredCost ~ TotalPaid + IndemnityPaid + 
                 Hospital + PhysicianOutpatient + Rx + ClaimantAge_at_DOI + 
                 Gender, data = base_de_datos)
summary(regresion)
## 
## Call:
## lm(formula = TotalIncurredCost ~ TotalPaid + IndemnityPaid + 
##     Hospital + PhysicianOutpatient + Rx + ClaimantAge_at_DOI + 
##     Gender, data = base_de_datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -337669     -73     879    1734  723053 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.460e+03  1.662e+03   0.879   0.3797    
## TotalPaid            1.443e+00  1.558e-02  92.634   <2e-16 ***
## IndemnityPaid        7.633e-02  3.313e-02   2.304   0.0213 *  
## Hospital             1.087e-03  1.040e-02   0.104   0.9168    
## PhysicianOutpatient  1.093e-03  1.972e-02   0.055   0.9558    
## Rx                  -2.531e-02  2.447e-02  -1.034   0.3010    
## ClaimantAge_at_DOI  -4.679e+01  3.688e+01  -1.269   0.2046    
## GenderMale          -1.529e+03  8.350e+02  -1.831   0.0672 .  
## GenderNot Available -1.238e+04  6.482e+03  -1.910   0.0562 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28810 on 4787 degrees of freedom
##   (26823 observations deleted due to missingness)
## Multiple R-squared:  0.8614, Adjusted R-squared:  0.8612 
## F-statistic:  3719 on 8 and 4787 DF,  p-value: < 2.2e-16
datos_nuevos <- data.frame(
  TotalPaid = c(10000, 20000, 30000, 40000, 50000), 
  IndemnityPaid = c(5000, 6000, 7000, 8000, 9000), 
  Hospital = c(1500, 2500, 3500, 4500, 5500), 
  PhysicianOutpatient = c(800, 900, 1000, 1100, 1200), 
  Rx = c(300, 400, 500, 600, 700), 
  ClaimantAge_at_DOI = c(30, 35, 40, 45, 50), 
  Gender = factor(c("Male", "Female", "Male", "Female", "Not Available"), 
                  levels = c("Male", "Female", "Not Available"))
)

predicciones <- predict(regresion, newdata = datos_nuevos)

print(predicciones)
##        1        2        3        4        5 
## 13334.20 29134.09 41876.50 57676.39 59567.99

Conclusión

Las variables TotalPaid e IndemnityPaid son estadísticamente significativas, mostrando un impacto importante en el costo total incurrido.

LS0tCnRpdGxlOiAiUmVncmVzacOzbiBsaW5lYWwtc2VndXJvcyIKYXV0aG9yOiAiU2ViYXN0acOhbiBGYWphcmRvLSBBMDE0MTIwMzUiCmRhdGU6ICIyMDI0LTA4LTE5IgpvdXRwdXQ6IAogIGh0bWxfZG9jdW1lbnQ6CiAgICB0b2M6IFRSVUUKICAgIHRvY19mbG9hdDogVFJVRQogICAgY29kZV9kb3dubG9hZDogVFJVRQogICAgdGhlbWU6IGNvc21vCi0tLQohW10oL1VzZXJzL3NlYmFzdGlhbmZhamFyZG8vRG93bmxvYWRzL3NlZ3Vyb3Mud2VicCkKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBncmF5OyI+TW9kZWxvIFByZWRpY3Rpdm88L3NwYW4+CgojIDxzcGFuIHN0eWxlPSJjb2xvcjogZ3JheTsiPlRlb3LDrWEgPC9zcGFuPgoqKmxtKCkqKiBlcyBsYSBmdW5jacOzbiBkZSBSIHBhcmEgYWp1c3RhciBtb2RlbG9zIGxpbmVhbGVzLiAKRXMgZWwgbW9kZWxvIGVzdGFkaXN0aWNvIG1hcyBiw6FzaWNvIHF1ZSBleGlzdGUgeSBtw6FzIGbDoWNpbCBkZSBpbnRlcnByZXRhci4gClBhcmEgaW50ZXJwcmV0YXJsbyBzZSB1c2EgbGEgbWVkaWRhIFItY3VhZHJhZGEsIHF1ZSBzaWduaWZpY2EgcXXDqSB0YW4gY2VyY2EgZXN0w6FuIGxvcyBkYXRvcyBkZSBsYSBsw61uZWEgZGUgcmVncmVzacOzbiBhanVzdGFkYSggVmEgZGUgMCBhIDEsIGRvbmRlIGVzIHF1ZSBlbCBtb2RlbG8gZXhwbGljYSB0b2RhIGxhIHZhcmlhYmlsaWRhZCkuCgojIDxzcGFuIHN0eWxlPSJjb2xvcjogZ3JheTsiPiBJbXBvcnRhciBiYXNlIGRlIGRhdG9zPC9zcGFuPgpgYGB7cn0KI2ZpbGUuY2hvb3NlKCkKYmFzZV9kZV9kYXRvcyA8LSByZWFkLmNzdigiL1VzZXJzL3NlYmFzdGlhbmZhamFyZG8vRG93bmxvYWRzL3NlZ3Vyb3MuY3N2IikKYGBgCgojIDxzcGFuIHN0eWxlPSJjb2xvcjogZ3JheTsiPkVudGVuZGVyIGxhIGJhc2UgZGUgZGF0b3M8L3NwYW4+CmBgYHtyfQpyZXN1bWVuIDwtIHN1bW1hcnkoYmFzZV9kZV9kYXRvcykKcmVzdW1lbgpgYGAKYGBge3J9CnBsb3QoYmFzZV9kZV9kYXRvcyRDbGFpbWFudEFnZV9hdF9ET0ksIGJhc2VfZGVfZGF0b3MkVG90YWxJbmN1cnJlZENvc3QsIAogICAgIG1haW49IkluZmx1ZW5jaWEgZGUgbGEgRWRhZCBkZWwgUmVjbGFtYW50ZSBzb2JyZSBlbCBDb3N0byBUb3RhbCBJbmN1cnJpZG8iLCAKICAgICB4bGFiPSJFZGFkIGRlbCBSZWNsYW1hbnRlIChhw7FvcykiLCAKICAgICB5bGFiPSJDb3N0byBUb3RhbCBJbmN1cnJpZG8gKCQpIikKYGBgCmBgYHtyfQpib3hwbG90KFRvdGFsSW5jdXJyZWRDb3N0IH4gR2VuZGVyLCBkYXRhID0gYmFzZV9kZV9kYXRvcywKICAgICAgICBtYWluPSJEaXN0cmlidWNpw7NuIGRlbCBDb3N0byBUb3RhbCBJbmN1cnJpZG8gcG9yIEfDqW5lcm8iLAogICAgICAgIHhsYWI9IkfDqW5lcm8iLAogICAgICAgIHlsYWI9IkNvc3RvIFRvdGFsIEluY3VycmlkbyAoJCkiKQpgYGAKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBncmF5OyI+R2VuZXJhciByZWdyZXNpb24gKG1vZGVsbyBsaW5lYWwpPC9zcGFuPgpgYGB7cn0KcmVncmVzaW9uIDwtIGxtKFRvdGFsSW5jdXJyZWRDb3N0IH4gVG90YWxQYWlkICsgSW5kZW1uaXR5UGFpZCArIE90aGVyUGFpZCArIEhvc3BpdGFsICsgUGh5c2ljaWFuT3V0cGF0aWVudCArIFJ4ICsgQ2xhaW1hbnRBZ2VfYXRfRE9JICsgR2VuZGVyLCBkYXRhID0gYmFzZV9kZV9kYXRvcykKc3VtbWFyeShyZWdyZXNpb24pCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6IGdyYXk7Ij5Db25zdHJ1aXIgdW4gbW9kZWxvIGRlIHByZWRpY2Npb24gPC9zcGFuPgpgYGB7cn0KCnJlZ3Jlc2lvbiA8LSBsbShUb3RhbEluY3VycmVkQ29zdCB+IFRvdGFsUGFpZCArIEluZGVtbml0eVBhaWQgKyAKICAgICAgICAgICAgICAgICBIb3NwaXRhbCArIFBoeXNpY2lhbk91dHBhdGllbnQgKyBSeCArIENsYWltYW50QWdlX2F0X0RPSSArIAogICAgICAgICAgICAgICAgIEdlbmRlciwgZGF0YSA9IGJhc2VfZGVfZGF0b3MpCnN1bW1hcnkocmVncmVzaW9uKQoKZGF0b3NfbnVldm9zIDwtIGRhdGEuZnJhbWUoCiAgVG90YWxQYWlkID0gYygxMDAwMCwgMjAwMDAsIDMwMDAwLCA0MDAwMCwgNTAwMDApLCAKICBJbmRlbW5pdHlQYWlkID0gYyg1MDAwLCA2MDAwLCA3MDAwLCA4MDAwLCA5MDAwKSwgCiAgSG9zcGl0YWwgPSBjKDE1MDAsIDI1MDAsIDM1MDAsIDQ1MDAsIDU1MDApLCAKICBQaHlzaWNpYW5PdXRwYXRpZW50ID0gYyg4MDAsIDkwMCwgMTAwMCwgMTEwMCwgMTIwMCksIAogIFJ4ID0gYygzMDAsIDQwMCwgNTAwLCA2MDAsIDcwMCksIAogIENsYWltYW50QWdlX2F0X0RPSSA9IGMoMzAsIDM1LCA0MCwgNDUsIDUwKSwgCiAgR2VuZGVyID0gZmFjdG9yKGMoIk1hbGUiLCAiRmVtYWxlIiwgIk1hbGUiLCAiRmVtYWxlIiwgIk5vdCBBdmFpbGFibGUiKSwgCiAgICAgICAgICAgICAgICAgIGxldmVscyA9IGMoIk1hbGUiLCAiRmVtYWxlIiwgIk5vdCBBdmFpbGFibGUiKSkKKQoKcHJlZGljY2lvbmVzIDwtIHByZWRpY3QocmVncmVzaW9uLCBuZXdkYXRhID0gZGF0b3NfbnVldm9zKQoKcHJpbnQocHJlZGljY2lvbmVzKQpgYGAKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiBncmF5OyI+Q29uY2x1c2nDs24gPC9zcGFuPgpMYXMgdmFyaWFibGVzIFRvdGFsUGFpZCBlIEluZGVtbml0eVBhaWQgc29uIGVzdGFkw61zdGljYW1lbnRlIHNpZ25pZmljYXRpdmFzLCBtb3N0cmFuZG8gdW4gaW1wYWN0byBpbXBvcnRhbnRlIGVuIGVsIGNvc3RvIHRvdGFsIGluY3Vycmlkby4K