Modelo Predictivo

Importar base de datos

# Load the claims dataset from a local CSV file. The path is machine-specific;
# run file.choose() interactively to locate the file on another computer.
base_de_datos <- read.csv(
  file = "/Users/agustingomezperez/Desktop/Inteligencia Artificial/seguros (1).csv"
)

Entender la base de datos

library(corrplot)
## corrplot 0.92 loaded
# Compute per-column descriptive statistics for the raw data and display them.
resumen <- summary(object = base_de_datos)
print(resumen)
##     ClaimID           TotalPaid       TotalReserves     TotalRecovery      
##  Min.   :  777632   Min.   :      0   Min.   :      0   Min.   :     0.00  
##  1st Qu.:  800748   1st Qu.:     83   1st Qu.:      0   1st Qu.:     0.00  
##  Median :  812128   Median :    271   Median :      0   Median :     0.00  
##  Mean   : 1864676   Mean   :  10404   Mean   :   3368   Mean   :    66.05  
##  3rd Qu.:  824726   3rd Qu.:   1122   3rd Qu.:      0   3rd Qu.:     0.00  
##  Max.   :62203364   Max.   :4527291   Max.   :1529053   Max.   :100000.00  
##                                                                            
##  IndemnityPaid      OtherPaid       TotalIncurredCost ClaimStatus       
##  Min.   :     0   Min.   :      0   Min.   : -10400   Length:31619      
##  1st Qu.:     0   1st Qu.:     80   1st Qu.:     80   Class :character  
##  Median :     0   Median :    265   Median :    266   Mode  :character  
##  Mean   :  4977   Mean   :   5427   Mean   :  13706                     
##  3rd Qu.:     0   3rd Qu.:   1023   3rd Qu.:   1098                     
##  Max.   :640732   Max.   :4129915   Max.   :4734750                     
##                                                                         
##  IncidentDate       IncidentDescription ReturnToWorkDate   ClaimantOpenedDate
##  Length:31619       Length:31619        Length:31619       Length:31619      
##  Class :character   Class :character    Class :character   Class :character  
##  Mode  :character   Mode  :character    Mode  :character   Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##  ClaimantClosedDate EmployerNotificationDate ReceivedDate      
##  Length:31619       Length:31619             Length:31619      
##  Class :character   Class :character         Class :character  
##  Mode  :character   Mode  :character         Mode  :character  
##                                                                
##                                                                
##                                                                
##                                                                
##     IsDenied       Transaction_Time Procesing_Time     ClaimantAge_at_DOI
##  Min.   :0.00000   Min.   :    0    Min.   :    0.00   Min.   :14.0      
##  1st Qu.:0.00000   1st Qu.:  211    1st Qu.:    4.00   1st Qu.:33.0      
##  Median :0.00000   Median :  780    Median :   10.00   Median :42.0      
##  Mean   :0.04463   Mean   : 1004    Mean   :   62.99   Mean   :41.6      
##  3rd Qu.:0.00000   3rd Qu.: 1440    3rd Qu.:   24.00   3rd Qu.:50.0      
##  Max.   :1.00000   Max.   :16428    Max.   :11558.00   Max.   :94.0      
##                    NA's   :614                                           
##     Gender          ClaimantType       InjuryNature       BodyPartRegion    
##  Length:31619       Length:31619       Length:31619       Length:31619      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    BodyPart         AverageWeeklyWage1    ClaimID1        BillReviewALE    
##  Length:31619       Min.   : 100.0     Min.   :  777632   Min.   : -448.0  
##  Class :character   1st Qu.: 492.0     1st Qu.:  800748   1st Qu.:   16.0  
##  Mode  :character   Median : 492.0     Median :  812128   Median :   24.0  
##                     Mean   : 536.5     Mean   : 1864676   Mean   :  188.7  
##                     3rd Qu.: 492.0     3rd Qu.:  824726   3rd Qu.:   64.1  
##                     Max.   :8613.5     Max.   :62203364   Max.   :46055.3  
##                                                           NA's   :14912    
##     Hospital         PhysicianOutpatient       Rx          
##  Min.   : -12570.4   Min.   :   -549.5   Min.   :  -160.7  
##  1st Qu.:    210.5   1st Qu.:    105.8   1st Qu.:    22.9  
##  Median :    613.9   Median :    218.0   Median :    61.5  
##  Mean   :   5113.2   Mean   :   1813.2   Mean   :  1695.2  
##  3rd Qu.:   2349.1   3rd Qu.:    680.6   3rd Qu.:   189.0  
##  Max.   :2759604.0   Max.   :1219766.6   Max.   :631635.5  
##  NA's   :19655       NA's   :2329        NA's   :20730

# Transformar y encontrar correlación

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the numeric columns that contain no missing values, so that cor()
# below produces a matrix without NA entries.
# where() replaces the superseded select_if(). Selecting through dplyr also
# keeps the result a data frame even if only one column survives, whereas
# df[, mask] would collapse to a bare vector in that case.
base_de_datos <- base_de_datos %>%
  select(where(function(columna) is.numeric(columna) && !anyNA(columna)))

# Plot the correlation matrix as a coloured grid with the coefficients printed
# in black; order = "FPC" arranges variables by first principal component order.
corrplot(
  cor(base_de_datos),
  method = "color",
  addCoef.col = "black",
  order = "FPC"
)

# Generar regresión (modelo lineal)

# First specification: regress TotalIncurredCost on every candidate predictor.
# NOTE(review): the recorded output for this fit reports one coefficient
# (OtherPaid) as not defined because of singularities and an R-squared of 1,
# i.e. the predictors reproduce the response exactly. The model is refitted
# with fewer terms further down.
regresion <- lm(
  formula = TotalIncurredCost ~ TotalPaid + IndemnityPaid + OtherPaid +
    TotalReserves + Procesing_Time + TotalRecovery + AverageWeeklyWage1 +
    ClaimantAge_at_DOI,
  data = base_de_datos
)
summary(regresion)
## 
## Call:
## lm(formula = TotalIncurredCost ~ TotalPaid + IndemnityPaid + 
##     OtherPaid + TotalReserves + Procesing_Time + TotalRecovery + 
##     AverageWeeklyWage1 + ClaimantAge_at_DOI, data = base_de_datos)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.145e-08 -4.000e-11 -2.000e-11 -1.000e-11  5.739e-07 
## 
## Coefficients: (1 not defined because of singularities)
##                      Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)         4.138e-10  8.651e-11  4.783e+00 1.74e-06 ***
## TotalPaid           1.000e+00  6.298e-16  1.588e+15  < 2e-16 ***
## IndemnityPaid      -7.199e-14  1.408e-15 -5.113e+01  < 2e-16 ***
## OtherPaid                  NA         NA         NA       NA    
## TotalReserves       1.000e+00  6.693e-16  1.494e+15  < 2e-16 ***
## Procesing_Time      2.635e-13  7.521e-14  3.503e+00  0.00046 ***
## TotalRecovery      -1.000e+00  1.554e-14 -6.435e+13  < 2e-16 ***
## AverageWeeklyWage1  2.437e-14  9.297e-14  2.620e-01  0.79322    
## ClaimantAge_at_DOI -1.465e-12  1.803e-12 -8.130e-01  0.41637    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.608e-09 on 31611 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.464e+30 on 7 and 31611 DF,  p-value: < 2.2e-16

Evaluar y, en caso necesario, ajustar la regresión

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Reduced specification: the same response, but without TotalPaid,
# IndemnityPaid, OtherPaid and AverageWeeklyWage1 from the first model.
# This overwrites the earlier `regresion` object.
regresion <- lm(
  formula = TotalIncurredCost ~ TotalReserves + Procesing_Time +
    TotalRecovery + ClaimantAge_at_DOI,
  data = base_de_datos
)
summary(regresion)
## 
## Call:
## lm(formula = TotalIncurredCost ~ TotalReserves + Procesing_Time + 
##     TotalRecovery + ClaimantAge_at_DOI, data = base_de_datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -744596   -7023   -4820   -2505 4314887 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -2.871e+03  9.690e+02  -2.963  0.00305 ** 
## TotalReserves       1.712e+00  7.273e-03 235.354  < 2e-16 ***
## Procesing_Time      4.582e+01  8.844e-01  51.805  < 2e-16 ***
## TotalRecovery       1.320e+00  1.951e-01   6.765 1.35e-11 ***
## ClaimantAge_at_DOI  1.884e+02  2.246e+01   8.388  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45470 on 31614 degrees of freedom
## Multiple R-squared:  0.7089, Adjusted R-squared:  0.7088 
## F-statistic: 1.924e+04 on 4 and 31614 DF,  p-value: < 2.2e-16
# Variance inflation factors for the reduced model, used to check for
# remaining multicollinearity among its predictors.
vif(mod = regresion)
##      TotalReserves     Procesing_Time      TotalRecovery ClaimantAge_at_DOI 
##           1.147592           1.139177           1.008341           1.001373

Construir un modelo de predicción

# Build a one-row data frame describing a hypothetical claim; the column names
# must match the predictors used in the fitted model.
datos_nuevos <- data.frame(
  TotalReserves = 5000,
  Procesing_Time = 60,
  TotalRecovery = 2013,
  ClaimantAge_at_DOI = 62
)

# Predicted TotalIncurredCost for that claim under the reduced model.
predict(regresion, newdata = datos_nuevos)
##        1 
## 22777.27

Un modelo de predicción usa una regresión lineal para estimar resultados futuros. Para evaluarlo, necesitamos identificar cuáles son las variables estadísticamente significativas que ayudan a predecir nuestra variable dependiente. También es importante tener en cuenta que pueden existir otros problemas, como la multicolinealidad, que afectan al modelo predictivo cuando varias variables se explican entre sí, como ocurre en el modelo 1. Si eliminamos esas variables, obtendremos un modelo que sí será capaz de hacer predicciones; y si queremos que sea más efectivo en cuanto a su R cuadrada, necesitaremos buscar más variables explicativas que no estén correlacionadas entre sí.

LS0tCnRpdGxlOiAiUmVncmVzacOzbiBMaW5lYWwiCmF1dGhvcjogIkFndXN0w61uIEfDs21leiBQZXJleiBBMDE3MzI4OTciCmRhdGU6ICIyMDI0LTA4LTE5IgpvdXRwdXQ6IAogIGh0bWxfZG9jdW1lbnQ6CiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDogdHJ1ZQogICAgY29kZV9kb3dubG9hZDogdHJ1ZSAKLS0tCgohW10oaHR0cHM6Ly9oaXBzLmhlYXJzdGFwcHMuY29tL2htZy1wcm9kL2ltYWdlcy9mb3JkLW11c3RhbmctY3Jhc2gtMi0xNjM2ODg2MTYwLmdpZj9yZXNpemU9MTIwMDoqKQoKIyBNb2RlbG8gUHJlZGljdGl2bwoKIyBJbXBvcnRhciBiYXNlIGRlIGRhdG9zCmBgYHtyfQojZmlsZS5jaG9vc2UoKQpiYXNlX2RlX2RhdG9zIDwtIHJlYWQuY3N2KCIvVXNlcnMvYWd1c3RpbmdvbWV6cGVyZXovRGVza3RvcC9JbnRlbGlnZW5jaWEgQXJ0aWZpY2lhbC9zZWd1cm9zICgxKS5jc3YiKQpgYGAKIyBFbnRlbmRlciBsYSBiYXNlIGRlIGRhdG9zCmBgYHtyfQpsaWJyYXJ5KGNvcnJwbG90KQpyZXN1bWVuIDwtIHN1bW1hcnkoYmFzZV9kZV9kYXRvcykKcmVzdW1lbgpgYGAKCgojVHJhbnNmb3JtYXIgeSBlbmNvbnRyYXIgY29ycmVsYWNpw7NuCmBgYHtyfQpsaWJyYXJ5KGRwbHlyKQpiYXNlX2RlX2RhdG9zPC1iYXNlX2RlX2RhdG9zICU+JSBzZWxlY3RfaWYoaXMubnVtZXJpYykKYmFzZV9kZV9kYXRvczwtYmFzZV9kZV9kYXRvc1ssIGNvbFN1bXMoaXMubmEoYmFzZV9kZV9kYXRvcykpID09IDBdCmNvcnJwbG90KGNvcihiYXNlX2RlX2RhdG9zKSxtZXRob2Q9J2NvbG9yJyxhZGRDb2VmLmNvbD0nYmxhY2snLCBvcmRlcj0nRlBDJykKYGBgCiMgR2VuZXJhciByZWdyZXNpb24gKG1vZGVsbyBsaW5lYWwpCgpgYGB7cn0KcmVncmVzaW9uIDwtIGxtKFRvdGFsSW5jdXJyZWRDb3N0IH4gVG90YWxQYWlkICsgSW5kZW1uaXR5UGFpZCArIE90aGVyUGFpZCArIFRvdGFsUmVzZXJ2ZXMgKyBQcm9jZXNpbmdfVGltZSArIFRvdGFsUmVjb3ZlcnkgKyBBdmVyYWdlV2Vla2x5V2FnZTEgKyBDbGFpbWFudEFnZV9hdF9ET0ksIGRhdGE9YmFzZV9kZV9kYXRvcykKc3VtbWFyeShyZWdyZXNpb24pCmBgYAojIEV2YWx1YXIsIHkgZW4gY2FzbyBuZWNlc2FyaW8sIGFqdXN0YXIgbGEgcmVncmVzaW9uIApgYGB7cn0KbGlicmFyeShjYXIpCnJlZ3Jlc2lvbiA8LSBsbShUb3RhbEluY3VycmVkQ29zdCB+IFRvdGFsUmVzZXJ2ZXMgKyBQcm9jZXNpbmdfVGltZSArIFRvdGFsUmVjb3ZlcnkgICsgQ2xhaW1hbnRBZ2VfYXRfRE9JICwgZGF0YT1iYXNlX2RlX2RhdG9zKQpzdW1tYXJ5KHJlZ3Jlc2lvbikKdmlmKHJlZ3Jlc2lvbikKYGBgCiMgQ29uc3RydWlyIHVuIG1vZGVsbyBkZSBwcmVkaWNjaW9uCmBgYHtyfQpkYXRvc19udWV2b3MgPC0gZGF0YS5mcmFtZShUb3RhbFJlc2VydmVzPTUwMDAsIFByb2Nlc2luZ19UaW1lPTYwLCBUb3RhbFJlY292ZXJ5PTIwMTMsIENsYWltYW50QWdlX2F0X0RPSSA9NjIpCnByZWRpY3QocmVncmVzaW9uLGRhdG9zX251ZXZvcykKYGBg
ClVuIG1vZGVsbyBkZSBwcmVkaWNjacOzbiB1c2EgdW5hIHJlZ3Jlc2nDs24gbGluZWFsIHBhcmEgZXN0aW1hciBmdXR1cm9zIHJlc3VsdGFkb3MsIHBhcmEgcG9kZXJsbyBldmFsdWFyIG5lY2VzaXRhbW9zIGVuY29udHJhciBjdWFsZXMgc29uIGxhcyB2YXJpYWJsZXMgZXN0YWTDrXN0aWNhbWVudGUgc2lnbmlmaWNhdGl2YXMgcXVlIGF5dWRhbiBhIHByZWRlY2lyIG51ZXN0cmEgdmFyaWFibGUgZGVwZW5kaWVudGUsIHRhbWJpw6luIGVzIGltcG9ydGFudGUgdGVuZXIgZW4gY3VlbnRhIHF1ZSBwdWVkZW4gZXhpc3RpciBvdHJvcyBwcm9ibGVtYXMgY29tbyBsYSBtdWx0aWNvbGluZWFsaWRhZCBxdWUgcG9kcsOhbiBhZmVjdGFyIGFsIG1vZGVsbyBwcmVkaWN0aXZvIHlhIHF1ZSBoYXkgdmFyaWFzIHZhcmlhYmxlcyBxdWUgc2UgZXhwbGljYW4gZW50cmUgc8OtIGNvbW8gZW4gZWwgbW9kZWxvIDEsIHNpIG5vc290cm9zIGVsaW1pbmFtb3MgZXN0YXMgdmFyaWFibGVzIHRlbmRyZW1vcyB1biBtb2RlbG8gcXVlIHNpIHNlcsOhIGNhcGF6IGRlIGhhY2VyIHByZWRpY2Npb25lcyB5IGVuIGNhc28gZGUgcXVlIHF1ZXJlbW9zIHF1ZSBzZWEgbcOhcyBlZmVjdGl2byBlbiBjdWFudG8gYSBzdSBSIGN1YWRyYWRhLCBuZWNlc2l0YXJlbW9zIGJ1c2NhciBtw6FzIHZhcmlhYmxlcyBleHBsaWNhdGl2YXMgcXVlIG5vIGVzdMOpbiBjb3JyZWxhY2lvbmFkYXMgZW50cmUgc8OtLgo=