Teoría

lm() es la función de R para ajustar modelos lineales. Es el modelo estadístico más básico que existe y el más fácil de interpretar. Para interpretarlo se usa la medida R-cuadrada, que indica qué tan cerca están los datos de la línea de regresión ajustada (va de 0 a 1, donde 1 significa que el modelo explica toda la variabilidad).

# Modelo Predictivo

# Load the claims dataset from the local CSV export into a data frame.
# NOTE(review): this is an absolute, machine-specific Windows path; the script
# only runs where this exact file exists.
base_de_datos <- read.csv(
  file = "C:\\Users\\lcbor\\Downloads\\seguros.csv"
)

# Get a first overview of the data: per-column summary statistics, stored in
# `resumen` and then printed.
resumen <- summary(object = base_de_datos)
print(resumen)
##     ClaimID           TotalPaid       TotalReserves     TotalRecovery      
##  Min.   :  777632   Min.   :      0   Min.   :      0   Min.   :     0.00  
##  1st Qu.:  800748   1st Qu.:     83   1st Qu.:      0   1st Qu.:     0.00  
##  Median :  812128   Median :    271   Median :      0   Median :     0.00  
##  Mean   : 1864676   Mean   :  10404   Mean   :   3368   Mean   :    66.05  
##  3rd Qu.:  824726   3rd Qu.:   1122   3rd Qu.:      0   3rd Qu.:     0.00  
##  Max.   :62203364   Max.   :4527291   Max.   :1529053   Max.   :100000.00  
##                                                                            
##  IndemnityPaid      OtherPaid       TotalIncurredCost ClaimStatus       
##  Min.   :     0   Min.   :      0   Min.   : -10400   Length:31619      
##  1st Qu.:     0   1st Qu.:     80   1st Qu.:     80   Class :character  
##  Median :     0   Median :    265   Median :    266   Mode  :character  
##  Mean   :  4977   Mean   :   5427   Mean   :  13706                     
##  3rd Qu.:     0   3rd Qu.:   1023   3rd Qu.:   1098                     
##  Max.   :640732   Max.   :4129915   Max.   :4734750                     
##                                                                         
##  IncidentDate       IncidentDescription ReturnToWorkDate   ClaimantOpenedDate
##  Length:31619       Length:31619        Length:31619       Length:31619      
##  Class :character   Class :character    Class :character   Class :character  
##  Mode  :character   Mode  :character    Mode  :character   Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##  ClaimantClosedDate EmployerNotificationDate ReceivedDate      
##  Length:31619       Length:31619             Length:31619      
##  Class :character   Class :character         Class :character  
##  Mode  :character   Mode  :character         Mode  :character  
##                                                                
##                                                                
##                                                                
##                                                                
##     IsDenied       Transaction_Time Procesing_Time     ClaimantAge_at_DOI
##  Min.   :0.00000   Min.   :    0    Min.   :    0.00   Min.   :14.0      
##  1st Qu.:0.00000   1st Qu.:  211    1st Qu.:    4.00   1st Qu.:33.0      
##  Median :0.00000   Median :  780    Median :   10.00   Median :42.0      
##  Mean   :0.04463   Mean   : 1004    Mean   :   62.99   Mean   :41.6      
##  3rd Qu.:0.00000   3rd Qu.: 1440    3rd Qu.:   24.00   3rd Qu.:50.0      
##  Max.   :1.00000   Max.   :16428    Max.   :11558.00   Max.   :94.0      
##                    NA's   :614                                           
##     Gender          ClaimantType       InjuryNature       BodyPartRegion    
##  Length:31619       Length:31619       Length:31619       Length:31619      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    BodyPart         AverageWeeklyWage1    ClaimID1        BillReviewALE    
##  Length:31619       Min.   : 100.0     Min.   :  777632   Min.   : -448.0  
##  Class :character   1st Qu.: 492.0     1st Qu.:  800748   1st Qu.:   16.0  
##  Mode  :character   Median : 492.0     Median :  812128   Median :   24.0  
##                     Mean   : 536.5     Mean   : 1864676   Mean   :  188.7  
##                     3rd Qu.: 492.0     3rd Qu.:  824726   3rd Qu.:   64.1  
##                     Max.   :8613.5     Max.   :62203364   Max.   :46055.3  
##                                                           NA's   :14912    
##     Hospital         PhysicianOutpatient       Rx          
##  Min.   : -12570.4   Min.   :   -549.5   Min.   :  -160.7  
##  1st Qu.:    210.5   1st Qu.:    105.8   1st Qu.:    22.9  
##  Median :    613.9   Median :    218.0   Median :    61.5  
##  Mean   :   5113.2   Mean   :   1813.2   Mean   :  1695.2  
##  3rd Qu.:   2349.1   3rd Qu.:    680.6   3rd Qu.:   189.0  
##  Max.   :2759604.0   Max.   :1219766.6   Max.   :631635.5  
##  NA's   :19655       NA's   :2329        NA's   :20730
# Inspect the structure of the data frame: column names, types and the first
# few values of each column.
utils::str(base_de_datos)
## 'data.frame':    31619 obs. of  30 variables:
##  $ ClaimID                 : int  777632 777646 777651 777661 777666 777668 777673 777676 777681 777694 ...
##  $ TotalPaid               : num  20 0 5407 288 235 ...
##  $ TotalReserves           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TotalRecovery           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ IndemnityPaid           : num  0 0 2246 0 0 ...
##  $ OtherPaid               : num  20 0 3161 288 235 ...
##  $ TotalIncurredCost       : num  20 0 5407 288 235 ...
##  $ ClaimStatus             : chr  "C" "C" "C" "C" ...
##  $ IncidentDate            : chr  "01-jul-99" "01-jul-99" "02-jul-99" "02-jul-99" ...
##  $ IncidentDescription     : chr  "EE. States while going down the steps--EE foot slipped & she fell down to cement on ground--injurher rt. Knee, "| __truncated__ "Neck, arm, wrist and neck... On going for 20 yrs, repetitive, performing ultrasound exam on patients. Repetitiv"| __truncated__ "EE was removing items from freezer when items started to fall - EE had to get out of way of falling boxes- strained back" "EE states the fumes from a truck caused her to getan upset stomach and headache." ...
##  $ ReturnToWorkDate        : chr  "" "" "" "" ...
##  $ ClaimantOpenedDate      : chr  "12-jul-99" "29-sep-05" "14-jul-99" "20-jul-99" ...
##  $ ClaimantClosedDate      : chr  "31-mar-05" "22DEC2005" "31-mar-05" "17-jun-08" ...
##  $ EmployerNotificationDate: chr  "01-jul-99" "01-jul-99" "02-jul-99" "02-jul-99" ...
##  $ ReceivedDate            : chr  "12-jul-99" "29-sep-05" "13-jul-99" "20-jul-99" ...
##  $ IsDenied                : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Transaction_Time        : int  2089 84 2087 3255 2050 2195 2493 2058 2086 2900 ...
##  $ Procesing_Time          : int  11 2282 12 18 49 9 8 39 10 13 ...
##  $ ClaimantAge_at_DOI      : int  49 48 33 29 54 49 38 48 40 51 ...
##  $ Gender                  : chr  "Female" "Female" "Female" "Female" ...
##  $ ClaimantType            : chr  "Medical Only" "Indemnity" "Indemnity" "Medical Only" ...
##  $ InjuryNature            : chr  "Laceration" "All Other Specific Injuries, Noc" "Strain" "Respiratory Disorders" ...
##  $ BodyPartRegion          : chr  "Multiple Body Parts" "Multiple Body Parts" "Trunk" "Multiple Body Parts" ...
##  $ BodyPart                : chr  "Multiple Body Parts (Including Body Systems and Body Parts)" "Multiple Body Parts (Including Body Systems and Body Parts)" "Lower Back Area" "Multiple Body Parts (Including Body Systems and Body Parts)" ...
##  $ AverageWeeklyWage1      : num  492 673 328 492 492 ...
##  $ ClaimID1                : int  777632 777646 777651 777661 777666 777668 777673 777676 777681 777694 ...
##  $ BillReviewALE           : num  NA 592 NA NA NA ...
##  $ Hospital                : num  NA 36342 299 NA NA ...
##  $ PhysicianOutpatient     : num  NA 41773.4 155 65.5 221.6 ...
##  $ Rx                      : num  50.3 3108.5 NA NA NA ...
# Fit the full linear model: total incurred cost explained by claim status,
# denial flag, processing time, claimant attributes, injury descriptors, wage
# and the medical cost components.
# NOTE(review): several predictors (BillReviewALE, Hospital, Rx, ...) contain
# many NAs, and lm() drops every row with a missing value in any model
# variable, so only a fraction of the rows is actually used.
regresion <- lm(
  formula = TotalIncurredCost ~
    ClaimStatus + IsDenied + Procesing_Time + ClaimantAge_at_DOI +
    Gender + ClaimantType + InjuryNature + BodyPartRegion +
    AverageWeeklyWage1 + BillReviewALE + Hospital +
    PhysicianOutpatient + Rx,
  data = base_de_datos
)
print(summary(regresion))
## 
## Call:
## lm(formula = TotalIncurredCost ~ ClaimStatus + IsDenied + Procesing_Time + 
##     ClaimantAge_at_DOI + Gender + ClaimantType + InjuryNature + 
##     BodyPartRegion + AverageWeeklyWage1 + BillReviewALE + Hospital + 
##     PhysicianOutpatient + Rx, data = base_de_datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -454179   -3153      24    2285 1063665 
## 
## Coefficients:
##                                                      Estimate Std. Error
## (Intercept)                                         1.701e+04  6.025e+03
## ClaimStatusO                                        4.490e+05  1.171e+04
## ClaimStatusR                                        3.190e+05  1.305e+04
## IsDenied                                           -8.474e+03  4.366e+03
## Procesing_Time                                      4.647e+01  4.818e+00
## ClaimantAge_at_DOI                                 -2.612e+01  6.833e+01
## GenderMale                                          3.149e+02  1.538e+03
## GenderNot Available                                 2.031e+03  1.205e+04
## ClaimantTypeMedical Only                           -2.477e+04  2.017e+03
## ClaimantTypeReport Only                            -2.431e+04  4.829e+03
## InjuryNatureAsbestosis                              6.366e+03  3.063e+04
## InjuryNatureBurn                                    1.883e+02  7.904e+03
## InjuryNatureCarpal Tunnel Syndrome                 -3.840e+03  1.039e+04
## InjuryNatureConcussion                              2.100e+04  1.679e+04
## InjuryNatureContagious Disease                      5.091e+03  1.674e+04
## InjuryNatureContusion                               3.604e+03  4.430e+03
## InjuryNatureCrushing                                3.074e+03  1.492e+04
## InjuryNatureDermatitis                              4.036e+03  7.050e+03
## InjuryNatureDislocation                            -3.692e+04  1.423e+04
## InjuryNatureDust Disease, NOC                       7.068e+02  4.304e+04
## InjuryNatureElectric Shock                          5.065e+03  2.512e+04
## InjuryNatureForeign Body                            4.459e+03  6.199e+03
## InjuryNatureFracture                                4.540e+03  6.567e+03
## InjuryNatureHearing Loss Or Impairment             -8.563e+02  2.521e+04
## InjuryNatureHeat Prostration                       -1.972e+03  2.513e+04
## InjuryNatureHernia                                 -9.356e+03  2.192e+04
## InjuryNatureInfection                              -2.409e+03  1.674e+04
## InjuryNatureInflammation                            4.660e+03  9.310e+03
## InjuryNatureLaceration                              3.311e+03  5.143e+03
## InjuryNatureLoss of Hearing                        -1.943e+04  2.528e+04
## InjuryNatureMental Stress                          -1.419e+04  1.978e+04
## InjuryNatureMultiple Physical Injuries Only         1.472e+04  7.683e+03
## InjuryNatureMyocardial Infarction                  -1.661e+04  3.091e+04
## InjuryNatureNo Physical Injury                      3.799e+03  8.442e+03
## InjuryNatureNon-Standard Code                      -1.644e+03  2.187e+04
## InjuryNaturePoisoning?Chemical (Other Than Metals)  4.469e+01  1.979e+04
## InjuryNaturePuncture                                4.340e+03  5.646e+03
## InjuryNatureRespiratory Disorders                  -2.827e+03  8.525e+03
## InjuryNatureRupture                                -2.115e+04  3.065e+04
## InjuryNatureSeverance                               8.548e+03  3.071e+04
## InjuryNatureSprain                                 -5.494e+03  5.390e+03
## InjuryNatureStrain                                  4.182e+03  4.543e+03
## InjuryNatureSyncope                                 3.808e+03  1.968e+04
## BodyPartRegionLower Extremities                     4.185e+03  3.199e+03
## BodyPartRegionMultiple Body Parts                   6.375e+02  3.339e+03
## BodyPartRegionNeck                                  1.189e+02  5.586e+03
## BodyPartRegionNon-Standard Code                    -1.555e+03  2.331e+04
## BodyPartRegionTrunk                                 4.658e+03  3.423e+03
## BodyPartRegionUpper Extremities                     9.662e+02  2.994e+03
## AverageWeeklyWage1                                  4.569e+00  3.241e+00
## BillReviewALE                                       3.181e-02  6.144e-01
## Hospital                                           -7.270e-04  2.014e-02
## PhysicianOutpatient                                -1.361e-02  5.445e-02
## Rx                                                  7.366e-03  4.369e-02
##                                                    t value Pr(>|t|)    
## (Intercept)                                          2.822  0.00480 ** 
## ClaimStatusO                                        38.348  < 2e-16 ***
## ClaimStatusR                                        24.432  < 2e-16 ***
## IsDenied                                            -1.941  0.05236 .  
## Procesing_Time                                       9.645  < 2e-16 ***
## ClaimantAge_at_DOI                                  -0.382  0.70227    
## GenderMale                                           0.205  0.83777    
## GenderNot Available                                  0.169  0.86613    
## ClaimantTypeMedical Only                           -12.280  < 2e-16 ***
## ClaimantTypeReport Only                             -5.034 5.07e-07 ***
## InjuryNatureAsbestosis                               0.208  0.83536    
## InjuryNatureBurn                                     0.024  0.98100    
## InjuryNatureCarpal Tunnel Syndrome                  -0.369  0.71180    
## InjuryNatureConcussion                               1.251  0.21111    
## InjuryNatureContagious Disease                       0.304  0.76103    
## InjuryNatureContusion                                0.814  0.41599    
## InjuryNatureCrushing                                 0.206  0.83672    
## InjuryNatureDermatitis                               0.572  0.56707    
## InjuryNatureDislocation                             -2.594  0.00953 ** 
## InjuryNatureDust Disease, NOC                        0.016  0.98690    
## InjuryNatureElectric Shock                           0.202  0.84025    
## InjuryNatureForeign Body                             0.719  0.47198    
## InjuryNatureFracture                                 0.691  0.48939    
## InjuryNatureHearing Loss Or Impairment              -0.034  0.97290    
## InjuryNatureHeat Prostration                        -0.078  0.93744    
## InjuryNatureHernia                                  -0.427  0.66957    
## InjuryNatureInfection                               -0.144  0.88562    
## InjuryNatureInflammation                             0.501  0.61674    
## InjuryNatureLaceration                               0.644  0.51980    
## InjuryNatureLoss of Hearing                         -0.768  0.44229    
## InjuryNatureMental Stress                           -0.717  0.47332    
## InjuryNatureMultiple Physical Injuries Only          1.915  0.05554 .  
## InjuryNatureMyocardial Infarction                   -0.537  0.59102    
## InjuryNatureNo Physical Injury                       0.450  0.65276    
## InjuryNatureNon-Standard Code                       -0.075  0.94008    
## InjuryNaturePoisoning?Chemical (Other Than Metals)   0.002  0.99820    
## InjuryNaturePuncture                                 0.769  0.44208    
## InjuryNatureRespiratory Disorders                   -0.332  0.74022    
## InjuryNatureRupture                                 -0.690  0.49038    
## InjuryNatureSeverance                                0.278  0.78075    
## InjuryNatureSprain                                  -1.019  0.30811    
## InjuryNatureStrain                                   0.921  0.35736    
## InjuryNatureSyncope                                  0.193  0.84659    
## BodyPartRegionLower Extremities                      1.308  0.19081    
## BodyPartRegionMultiple Body Parts                    0.191  0.84862    
## BodyPartRegionNeck                                   0.021  0.98302    
## BodyPartRegionNon-Standard Code                     -0.067  0.94682    
## BodyPartRegionTrunk                                  1.360  0.17377    
## BodyPartRegionUpper Extremities                      0.323  0.74697    
## AverageWeeklyWage1                                   1.410  0.15871    
## BillReviewALE                                        0.052  0.95871    
## Hospital                                            -0.036  0.97120    
## PhysicianOutpatient                                 -0.250  0.80267    
## Rx                                                   0.169  0.86612    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42770 on 3191 degrees of freedom
##   (28374 observations deleted due to missingness)
## Multiple R-squared:  0.6116, Adjusted R-squared:  0.6051 
## F-statistic:  94.8 on 53 and 3191 DF,  p-value: < 2.2e-16
# Evaluate the first fit and refit with a reduced set of predictors.
# This overwrites `regresion`, so the full model is no longer available.
# NOTE(review): Gender and Rx are kept here although they are not
# significant in the output below; Rx is also the column whose NAs cause
# most rows to be dropped. Confirm whether they should stay.
regresion <- lm(
  formula = TotalIncurredCost ~
    ClaimStatus + IsDenied + Procesing_Time +
    Gender + ClaimantType + Rx,
  data = base_de_datos
)
print(summary(regresion))
## 
## Call:
## lm(formula = TotalIncurredCost ~ ClaimStatus + IsDenied + Procesing_Time + 
##     Gender + ClaimantType + Rx, data = base_de_datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -613611   -1298     332     925 2635418 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.171e+04  1.347e+03  16.118  < 2e-16 ***
## ClaimStatusO              4.357e+05  6.002e+03  72.589  < 2e-16 ***
## ClaimStatusR              2.543e+05  7.006e+03  36.296  < 2e-16 ***
## IsDenied                 -9.765e+03  2.724e+03  -3.585 0.000339 ***
## Procesing_Time            3.809e+01  2.242e+00  16.991  < 2e-16 ***
## GenderMale                6.080e+02  1.097e+03   0.554 0.579469    
## GenderNot Available      -6.497e+03  9.556e+03  -0.680 0.496640    
## ClaimantTypeMedical Only -2.263e+04  1.366e+03 -16.562  < 2e-16 ***
## ClaimantTypeReport Only  -2.315e+04  3.028e+03  -7.648 2.22e-14 ***
## Rx                        1.172e-02  4.309e-02   0.272 0.785670    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56980 on 10879 degrees of freedom
##   (20730 observations deleted due to missingness)
## Multiple R-squared:  0.525,  Adjusted R-squared:  0.5246 
## F-statistic:  1336 on 9 and 10879 DF,  p-value: < 2.2e-16
# Build a one-row data frame describing a hypothetical claim and predict its
# total incurred cost with the model currently stored in `regresion`.
# Column names must match the predictor names used in the model formula.
datos_nuevos <- data.frame(
  Procesing_Time = 12,
  ClaimStatus = "C",
  ClaimantType = "Indemnity",
  IsDenied = 0,
  Gender = "Female",
  Rx = 154.97
)
predict(object = regresion, newdata = datos_nuevos)
##        1 
## 22172.82
# En conclusión, el modelo puede mejorar: algunas de las variables que hacen mi modelo más significativo tienen muchos registros sin datos (valores NA) en la base de datos.