Importando librerias
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
library(ggplot2)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Importando la base de datos
quejas <- read_excel("C:/Users/maxwi/Desktop/ClaimsData2018.xlsx")
resumen <- read_excel("C:/Users/maxwi/Desktop/TransactionsSummary2018.xlsx")
Uniendo las bases de datos
# Uniendo las bases de datos a partir de la columna ClaimsID
bdc <- merge(quejas, resumen, by= "ClaimID", all= TRUE)
#Base de datos sólo hombres
bdc_hombres <- subset(bdc, Gender == "Male")
summary(bdc_hombres)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Length:65125 Length:65125 Length:65125
## 1st Qu.: 811585 Class :character Class :character Class :character
## Median : 845656 Mode :character Mode :character Mode :character
## Mean :10174362
## 3rd Qu.:22721079
## Max. :62203891
##
## IndemnityPaid OtherPaid ClaimStatus
## Length:65125 Length:65125 Length:65125
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## IncidentDate IncidentDescription
## Min. :1969-06-02 00:00:00 Length:65125
## 1st Qu.:1999-08-16 00:00:00 Class :character
## Median :2004-03-10 00:00:00 Mode :character
## Mean :2004-04-07 12:27:37
## 3rd Qu.:2009-01-27 00:00:00
## Max. :2014-06-27 00:00:00
##
## ReturnToWorkDate AverageWeeklyWage ClaimantOpenedDate
## Min. :1976-10-29 00:00:00 Length:65125 Min. :1980-07-03 00:00:00
## 1st Qu.:2001-03-07 00:00:00 Class :character 1st Qu.:1999-09-27 00:00:00
## Median :2007-04-05 00:00:00 Mode :character Median :2004-04-22 00:00:00
## Mean :2006-03-06 11:39:51 Mean :2004-05-25 16:07:39
## 3rd Qu.:2011-05-28 00:00:00 3rd Qu.:2009-04-02 00:00:00
## Max. :2014-12-13 00:00:00 Max. :2014-06-30 00:00:00
## NA's :28982
## ClaimantClosedDate EmployerNotificationDate
## Min. :1999-06-01 00:00:00 Min. :1972-09-10 00:00:00
## 1st Qu.:2005-03-31 00:00:00 1st Qu.:1999-11-08 00:00:00
## Median :2006-05-04 00:00:00 Median :2004-08-27 00:00:00
## Mean :2007-07-21 03:06:41 Mean :2005-12-16 20:44:36
## 3rd Qu.:2009-12-17 00:00:00 3rd Qu.:2009-09-17 00:00:00
## Max. :2014-06-30 00:00:00 Max. :9999-07-21 00:00:00
## NA's :2316 NA's :11554
## ReceivedDate IsDenied ClaimantAge_at_DOI
## Min. :1980-07-03 00:00:00 Length:65125 Length:65125
## 1st Qu.:1999-10-04 00:00:00 Class :character Class :character
## Median :2004-04-19 00:00:00 Mode :character Mode :character
## Mean :2004-12-30 06:26:39
## 3rd Qu.:2009-02-20 00:00:00
## Max. :9999-01-21 00:00:00
##
## Gender ClaimantType InjuryNature BodyPartRegion
## Length:65125 Length:65125 Length:65125 Length:65125
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BodyPart BillReviewALE Hospital PhysicianOutpatient
## Length:65125 Min. : -80.00 Min. : -22.9 Min. : -549.5
## Class :character 1st Qu.: 8.25 1st Qu.: 202.4 1st Qu.: 107.5
## Mode :character Median : 24.00 Median : 582.1 Median : 223.2
## Mean : 200.47 Mean : 4916.0 Mean : 1759.4
## 3rd Qu.: 65.64 3rd Qu.: 2337.9 3rd Qu.: 695.3
## Max. :56475.30 Max. :2759604.0 Max. :1219766.6
## NA's :53799 NA's :55514 NA's :41065
## Rx
## Min. : -469.5
## 1st Qu.: 22.1
## Median : 59.6
## Mean : 1637.2
## 3rd Qu.: 185.5
## Max. :631635.5
## NA's :56154
Cambiando el tipo de variable
#Cambiando variable a numérica
bdc_hombres$ClaimID <- as.numeric(bdc_hombres$ClaimID)
bdc_hombres$TotalPaid <- as.numeric(bdc_hombres$TotalPaid)
bdc_hombres$TotalReserves <- as.numeric(bdc_hombres$TotalReserves)
bdc_hombres$TotalRecovery <- as.numeric(bdc_hombres$TotalRecovery)
bdc_hombres$IndemnityPaid <- as.numeric(bdc_hombres$IndemnityPaid)
bdc_hombres$OtherPaid <- as.numeric(bdc_hombres$OtherPaid)
bdc_hombres$AverageWeeklyWage <- as.numeric(bdc_hombres$AverageWeeklyWage)
## Warning: NAs introducidos por coerción
Seleccionando las variables para el modelo
regresion <- lm(ClaimID ~ TotalPaid + TotalRecovery + AverageWeeklyWage + ClaimantAge_at_DOI + BillReviewALE + Hospital + Rx, data=bdc_hombres)
summary(regresion)
##
## Call:
## lm(formula = ClaimID ~ TotalPaid + TotalRecovery + AverageWeeklyWage +
## ClaimantAge_at_DOI + BillReviewALE + Hospital + Rx, data = bdc_hombres)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16765469 -47310 -2049 45050 26900994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.234e+05 1.491e+06 0.485 0.628
## TotalPaid 2.344e-01 8.521e-01 0.275 0.783
## TotalRecovery -2.383e+00 2.725e+01 -0.087 0.930
## AverageWeeklyWage 2.054e+02 1.305e+02 1.574 0.116
## ClaimantAge_at_DOI12 -3.022e+04 2.109e+06 -0.014 0.989
## ClaimantAge_at_DOI18 3.627e+04 2.103e+06 0.017 0.986
## ClaimantAge_at_DOI19 8.321e+03 1.664e+06 0.005 0.996
## ClaimantAge_at_DOI2 7.096e+04 2.104e+06 0.034 0.973
## ClaimantAge_at_DOI20 1.584e+05 1.729e+06 0.092 0.927
## ClaimantAge_at_DOI21 2.453e+03 1.591e+06 0.002 0.999
## ClaimantAge_at_DOI22 9.335e+04 1.629e+06 0.057 0.954
## ClaimantAge_at_DOI23 2.667e+04 1.549e+06 0.017 0.986
## ClaimantAge_at_DOI24 4.535e+04 1.559e+06 0.029 0.977
## ClaimantAge_at_DOI25 -1.407e+03 1.608e+06 -0.001 0.999
## ClaimantAge_at_DOI26 4.376e+04 1.606e+06 0.027 0.978
## ClaimantAge_at_DOI27 -1.118e+04 1.549e+06 -0.007 0.994
## ClaimantAge_at_DOI28 2.206e+04 1.549e+06 0.014 0.989
## ClaimantAge_at_DOI29 -2.183e+04 1.537e+06 -0.014 0.989
## ClaimantAge_at_DOI30 1.944e+06 1.535e+06 1.267 0.206
## ClaimantAge_at_DOI31 2.541e+04 1.549e+06 0.016 0.987
## ClaimantAge_at_DOI32 -8.775e+03 1.523e+06 -0.006 0.995
## ClaimantAge_at_DOI33 -6.663e+03 1.519e+06 -0.004 0.997
## ClaimantAge_at_DOI34 2.908e+03 1.537e+06 0.002 0.998
## ClaimantAge_at_DOI35 -8.883e+02 1.533e+06 -0.001 1.000
## ClaimantAge_at_DOI36 -2.186e+04 1.521e+06 -0.014 0.989
## ClaimantAge_at_DOI37 -2.185e+04 1.542e+06 -0.014 0.989
## ClaimantAge_at_DOI38 2.441e+03 1.606e+06 0.002 0.999
## ClaimantAge_at_DOI39 -4.026e+04 1.536e+06 -0.026 0.979
## ClaimantAge_at_DOI40 -1.158e+04 1.534e+06 -0.008 0.994
## ClaimantAge_at_DOI41 4.583e+03 1.536e+06 0.003 0.998
## ClaimantAge_at_DOI42 -1.646e+04 1.525e+06 -0.011 0.991
## ClaimantAge_at_DOI43 -2.816e+04 1.541e+06 -0.018 0.985
## ClaimantAge_at_DOI44 -2.724e+04 1.541e+06 -0.018 0.986
## ClaimantAge_at_DOI45 -1.622e+04 1.527e+06 -0.011 0.992
## ClaimantAge_at_DOI46 -5.115e+04 1.555e+06 -0.033 0.974
## ClaimantAge_at_DOI47 -1.518e+05 1.551e+06 -0.098 0.922
## ClaimantAge_at_DOI48 -4.413e+04 1.527e+06 -0.029 0.977
## ClaimantAge_at_DOI49 -3.729e+04 1.537e+06 -0.024 0.981
## ClaimantAge_at_DOI50 -6.026e+04 1.524e+06 -0.040 0.968
## ClaimantAge_at_DOI51 -3.083e+04 1.515e+06 -0.020 0.984
## ClaimantAge_at_DOI52 -3.121e+04 1.537e+06 -0.020 0.984
## ClaimantAge_at_DOI53 -2.037e+04 1.521e+06 -0.013 0.989
## ClaimantAge_at_DOI54 -5.700e+04 1.541e+06 -0.037 0.971
## ClaimantAge_at_DOI55 -5.157e+04 1.555e+06 -0.033 0.974
## ClaimantAge_at_DOI56 8.371e+05 1.548e+06 0.541 0.589
## ClaimantAge_at_DOI57 -4.547e+04 1.561e+06 -0.029 0.977
## ClaimantAge_at_DOI58 -1.921e+04 1.607e+06 -0.012 0.990
## ClaimantAge_at_DOI59 -5.115e+04 1.632e+06 -0.031 0.975
## ClaimantAge_at_DOI60 -6.493e+04 1.573e+06 -0.041 0.967
## ClaimantAge_at_DOI61 1.697e+06 1.607e+06 1.056 0.291
## ClaimantAge_at_DOI62 6.806e+03 1.577e+06 0.004 0.997
## ClaimantAge_at_DOI63 5.024e+04 2.109e+06 0.024 0.981
## ClaimantAge_at_DOI65 -1.919e+05 1.720e+06 -0.112 0.911
## ClaimantAge_at_DOI66 2.594e+04 2.110e+06 0.012 0.990
## ClaimantAge_at_DOI68 1.684e+07 1.822e+06 9.239 <2e-16 ***
## ClaimantAge_at_DOI71 1.398e+04 2.101e+06 0.007 0.995
## ClaimantAge_at_DOI76 -6.103e+04 2.104e+06 -0.029 0.977
## ClaimantAge_at_DOI80 -2.315e+04 2.102e+06 -0.011 0.991
## ClaimantAge_at_DOI82 8.485e+04 2.104e+06 0.040 0.968
## ClaimantAge_at_DOINULL -1.527e+04 1.492e+06 -0.010 0.992
## BillReviewALE 9.382e+00 2.863e+01 0.328 0.743
## Hospital -5.126e-01 2.117e+00 -0.242 0.809
## Rx -1.580e+00 3.431e+00 -0.461 0.645
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1485000 on 691 degrees of freedom
## (64371 observations deleted due to missingness)
## Multiple R-squared: 0.2976, Adjusted R-squared: 0.2346
## F-statistic: 4.722 on 62 and 691 DF, p-value: < 2.2e-16
Corriendo la predicción
datos <- data.frame(TotalPaid=9295.89, TotalRecovery=0, AverageWeeklyWage=639.59, ClaimantAge_at_DOI="61", BillReviewALE=74.68, Hospital=106.53, Rx=56.52)
predict(regresion,datos)
## 1
## 2554675