# Location of the claims CSV. Set the SEGUROS_CSV environment variable to load
# a copy stored elsewhere; when it is unset or empty, the original author's
# path is used. (Run file.choose() interactively to look up a path.)
ruta_csv <- Sys.getenv("SEGUROS_CSV", unset = "")
if (!nzchar(ruta_csv)) {
  ruta_csv <- "/Users/agustingomezperez/Desktop/Inteligencia Artificial/seguros (1).csv"
}
# Fail early with an explicit message rather than read.csv()'s connection error.
if (!file.exists(ruta_csv)) {
  stop("Claims CSV not found: ", ruta_csv, call. = FALSE)
}
base_de_datos <- read.csv(ruta_csv)
library(corrplot)
## corrplot 0.92 loaded
# Per-column descriptive statistics of the freshly loaded data. Evaluating the
# bare name afterwards auto-prints the table echoed in the lines that follow.
resumen <- summary(object = base_de_datos)
resumen
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 777632 Min. : 0 Min. : 0 Min. : 0.00
## 1st Qu.: 800748 1st Qu.: 83 1st Qu.: 0 1st Qu.: 0.00
## Median : 812128 Median : 271 Median : 0 Median : 0.00
## Mean : 1864676 Mean : 10404 Mean : 3368 Mean : 66.05
## 3rd Qu.: 824726 3rd Qu.: 1122 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62203364 Max. :4527291 Max. :1529053 Max. :100000.00
##
## IndemnityPaid OtherPaid TotalIncurredCost ClaimStatus
## Min. : 0 Min. : 0 Min. : -10400 Length:31619
## 1st Qu.: 0 1st Qu.: 80 1st Qu.: 80 Class :character
## Median : 0 Median : 265 Median : 266 Mode :character
## Mean : 4977 Mean : 5427 Mean : 13706
## 3rd Qu.: 0 3rd Qu.: 1023 3rd Qu.: 1098
## Max. :640732 Max. :4129915 Max. :4734750
##
## IncidentDate IncidentDescription ReturnToWorkDate ClaimantOpenedDate
## Length:31619 Length:31619 Length:31619 Length:31619
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## ClaimantClosedDate EmployerNotificationDate ReceivedDate
## Length:31619 Length:31619 Length:31619
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## IsDenied Transaction_Time Procesing_Time ClaimantAge_at_DOI
## Min. :0.00000 Min. : 0 Min. : 0.00 Min. :14.0
## 1st Qu.:0.00000 1st Qu.: 211 1st Qu.: 4.00 1st Qu.:33.0
## Median :0.00000 Median : 780 Median : 10.00 Median :42.0
## Mean :0.04463 Mean : 1004 Mean : 62.99 Mean :41.6
## 3rd Qu.:0.00000 3rd Qu.: 1440 3rd Qu.: 24.00 3rd Qu.:50.0
## Max. :1.00000 Max. :16428 Max. :11558.00 Max. :94.0
## NA's :614
## Gender ClaimantType InjuryNature BodyPartRegion
## Length:31619 Length:31619 Length:31619 Length:31619
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BodyPart AverageWeeklyWage1 ClaimID1 BillReviewALE
## Length:31619 Min. : 100.0 Min. : 777632 Min. : -448.0
## Class :character 1st Qu.: 492.0 1st Qu.: 800748 1st Qu.: 16.0
## Mode :character Median : 492.0 Median : 812128 Median : 24.0
## Mean : 536.5 Mean : 1864676 Mean : 188.7
## 3rd Qu.: 492.0 3rd Qu.: 824726 3rd Qu.: 64.1
## Max. :8613.5 Max. :62203364 Max. :46055.3
## NA's :14912
## Hospital PhysicianOutpatient Rx
## Min. : -12570.4 Min. : -549.5 Min. : -160.7
## 1st Qu.: 210.5 1st Qu.: 105.8 1st Qu.: 22.9
## Median : 613.9 Median : 218.0 Median : 61.5
## Mean : 5113.2 Mean : 1813.2 Mean : 1695.2
## 3rd Qu.: 2349.1 3rd Qu.: 680.6 3rd Qu.: 189.0
## Max. :2759604.0 Max. :1219766.6 Max. :631635.5
## NA's :19655 NA's :2329 NA's :20730
#Transformar y encontrar correlación
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the numeric columns. select(where(...)) replaces the superseded
# select_if(); it needs dplyr >= 1.0.0.
base_de_datos <- base_de_datos %>%
  select(where(is.numeric))
# Discard every column that contains at least one NA. drop = FALSE keeps the
# result a data frame even when a single column survives the filter.
base_de_datos <- base_de_datos[
  ,
  colSums(is.na(base_de_datos)) == 0,
  drop = FALSE
]
# Correlation heat map of the remaining columns: coloured cells, coefficients
# printed in black, variables ordered with corrplot's "FPC" option.
corrplot(
  cor(base_de_datos),
  method = "color",
  addCoef.col = "black",
  order = "FPC"
)
# First linear model: TotalIncurredCost regressed on the paid, reserve and
# recovery amounts plus processing time, weekly wage and claimant age.
# The summary recorded below reports OtherPaid as not estimable because of
# singularities and an R-squared of 1.
regresion <- lm(
  formula = TotalIncurredCost ~ TotalPaid + IndemnityPaid + OtherPaid +
    TotalReserves + Procesing_Time + TotalRecovery + AverageWeeklyWage1 +
    ClaimantAge_at_DOI,
  data = base_de_datos
)
summary(regresion)
##
## Call:
## lm(formula = TotalIncurredCost ~ TotalPaid + IndemnityPaid +
## OtherPaid + TotalReserves + Procesing_Time + TotalRecovery +
## AverageWeeklyWage1 + ClaimantAge_at_DOI, data = base_de_datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.145e-08 -4.000e-11 -2.000e-11 -1.000e-11 5.739e-07
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.138e-10 8.651e-11 4.783e+00 1.74e-06 ***
## TotalPaid 1.000e+00 6.298e-16 1.588e+15 < 2e-16 ***
## IndemnityPaid -7.199e-14 1.408e-15 -5.113e+01 < 2e-16 ***
## OtherPaid NA NA NA NA
## TotalReserves 1.000e+00 6.693e-16 1.494e+15 < 2e-16 ***
## Procesing_Time 2.635e-13 7.521e-14 3.503e+00 0.00046 ***
## TotalRecovery -1.000e+00 1.554e-14 -6.435e+13 < 2e-16 ***
## AverageWeeklyWage1 2.437e-14 9.297e-14 2.620e-01 0.79322
## ClaimantAge_at_DOI -1.465e-12 1.803e-12 -8.130e-01 0.41637
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.608e-09 on 31611 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.464e+30 on 7 and 31611 DF, p-value: < 2.2e-16
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Second linear model with a reduced predictor set: reserves, processing time,
# recovery and claimant age. Assigning to `regresion` again replaces any
# earlier fit stored under that name.
regresion <- lm(
  formula = TotalIncurredCost ~ TotalReserves + Procesing_Time +
    TotalRecovery + ClaimantAge_at_DOI,
  data = base_de_datos
)
summary(regresion)
##
## Call:
## lm(formula = TotalIncurredCost ~ TotalReserves + Procesing_Time +
## TotalRecovery + ClaimantAge_at_DOI, data = base_de_datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -744596 -7023 -4820 -2505 4314887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.871e+03 9.690e+02 -2.963 0.00305 **
## TotalReserves 1.712e+00 7.273e-03 235.354 < 2e-16 ***
## Procesing_Time 4.582e+01 8.844e-01 51.805 < 2e-16 ***
## TotalRecovery 1.320e+00 1.951e-01 6.765 1.35e-11 ***
## ClaimantAge_at_DOI 1.884e+02 2.246e+01 8.388 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45470 on 31614 degrees of freedom
## Multiple R-squared: 0.7089, Adjusted R-squared: 0.7088
## F-statistic: 1.924e+04 on 4 and 31614 DF, p-value: < 2.2e-16
# Variance inflation factor of each predictor in the current fit.
vif(regresion)
## TotalReserves Procesing_Time TotalRecovery ClaimantAge_at_DOI
## 1.147592 1.139177 1.008341 1.001373
# One hypothetical claim, with a value for every predictor in the formula,
# scored with the fitted model.
datos_nuevos <- data.frame(
  TotalReserves = 5000,
  Procesing_Time = 60,
  TotalRecovery = 2013,
  ClaimantAge_at_DOI = 62
)
predict(regresion, newdata = datos_nuevos)
## 1
## 22777.27
Un modelo de predicción usa una regresión lineal para estimar resultados futuros. Para poder evaluarlo necesitamos encontrar cuáles son las variables estadísticamente significativas que ayudan a predecir nuestra variable dependiente. También es importante tener en cuenta que pueden existir otros problemas, como la multicolinealidad, que afectan al modelo predictivo cuando varias variables se explican entre sí, como ocurre en el modelo 1. Si eliminamos estas variables, tendremos un modelo que sí será capaz de hacer predicciones; y, en caso de que queramos que sea más efectivo en cuanto a su R cuadrada, necesitaremos buscar más variables explicativas que no estén correlacionadas entre sí.