Importando librerias

library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
library(ggplot2)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Importando la base de datos

quejas <- read_excel("C:/Users/maxwi/Desktop/ClaimsData2018.xlsx")

resumen <- read_excel("C:/Users/maxwi/Desktop/TransactionsSummary2018.xlsx")

Uniendo las bases de datos

# Uniendo las bases de datos a partir de la columna ClaimsID
bdc <- merge(quejas, resumen, by= "ClaimID", all= TRUE)
#Base de datos sólo hombres
bdc_hombres <- subset(bdc, Gender == "Male")
summary(bdc_hombres)
##     ClaimID          TotalPaid         TotalReserves      TotalRecovery     
##  Min.   :  650915   Length:65125       Length:65125       Length:65125      
##  1st Qu.:  811585   Class :character   Class :character   Class :character  
##  Median :  845656   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :10174362                                                           
##  3rd Qu.:22721079                                                           
##  Max.   :62203891                                                           
##                                                                             
##  IndemnityPaid       OtherPaid         ClaimStatus       
##  Length:65125       Length:65125       Length:65125      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   IncidentDate                 IncidentDescription
##  Min.   :1969-06-02 00:00:00   Length:65125       
##  1st Qu.:1999-08-16 00:00:00   Class :character   
##  Median :2004-03-10 00:00:00   Mode  :character   
##  Mean   :2004-04-07 12:27:37                      
##  3rd Qu.:2009-01-27 00:00:00                      
##  Max.   :2014-06-27 00:00:00                      
##                                                   
##  ReturnToWorkDate              AverageWeeklyWage  ClaimantOpenedDate           
##  Min.   :1976-10-29 00:00:00   Length:65125       Min.   :1980-07-03 00:00:00  
##  1st Qu.:2001-03-07 00:00:00   Class :character   1st Qu.:1999-09-27 00:00:00  
##  Median :2007-04-05 00:00:00   Mode  :character   Median :2004-04-22 00:00:00  
##  Mean   :2006-03-06 11:39:51                      Mean   :2004-05-25 16:07:39  
##  3rd Qu.:2011-05-28 00:00:00                      3rd Qu.:2009-04-02 00:00:00  
##  Max.   :2014-12-13 00:00:00                      Max.   :2014-06-30 00:00:00  
##  NA's   :28982                                                                 
##  ClaimantClosedDate            EmployerNotificationDate     
##  Min.   :1999-06-01 00:00:00   Min.   :1972-09-10 00:00:00  
##  1st Qu.:2005-03-31 00:00:00   1st Qu.:1999-11-08 00:00:00  
##  Median :2006-05-04 00:00:00   Median :2004-08-27 00:00:00  
##  Mean   :2007-07-21 03:06:41   Mean   :2005-12-16 20:44:36  
##  3rd Qu.:2009-12-17 00:00:00   3rd Qu.:2009-09-17 00:00:00  
##  Max.   :2014-06-30 00:00:00   Max.   :9999-07-21 00:00:00  
##  NA's   :2316                  NA's   :11554                
##   ReceivedDate                   IsDenied         ClaimantAge_at_DOI
##  Min.   :1980-07-03 00:00:00   Length:65125       Length:65125      
##  1st Qu.:1999-10-04 00:00:00   Class :character   Class :character  
##  Median :2004-04-19 00:00:00   Mode  :character   Mode  :character  
##  Mean   :2004-12-30 06:26:39                                        
##  3rd Qu.:2009-02-20 00:00:00                                        
##  Max.   :9999-01-21 00:00:00                                        
##                                                                     
##     Gender          ClaimantType       InjuryNature       BodyPartRegion    
##  Length:65125       Length:65125       Length:65125       Length:65125      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    BodyPart         BillReviewALE         Hospital         PhysicianOutpatient
##  Length:65125       Min.   :  -80.00   Min.   :    -22.9   Min.   :   -549.5  
##  Class :character   1st Qu.:    8.25   1st Qu.:    202.4   1st Qu.:    107.5  
##  Mode  :character   Median :   24.00   Median :    582.1   Median :    223.2  
##                     Mean   :  200.47   Mean   :   4916.0   Mean   :   1759.4  
##                     3rd Qu.:   65.64   3rd Qu.:   2337.9   3rd Qu.:    695.3  
##                     Max.   :56475.30   Max.   :2759604.0   Max.   :1219766.6  
##                     NA's   :53799      NA's   :55514       NA's   :41065      
##        Rx          
##  Min.   :  -469.5  
##  1st Qu.:    22.1  
##  Median :    59.6  
##  Mean   :  1637.2  
##  3rd Qu.:   185.5  
##  Max.   :631635.5  
##  NA's   :56154

Cambiando el tipo de variable

#Cambiando variable a numérica
bdc_hombres$ClaimID <- as.numeric(bdc_hombres$ClaimID)
bdc_hombres$TotalPaid <- as.numeric(bdc_hombres$TotalPaid)
bdc_hombres$TotalReserves <- as.numeric(bdc_hombres$TotalReserves)
bdc_hombres$TotalRecovery <- as.numeric(bdc_hombres$TotalRecovery)
bdc_hombres$IndemnityPaid <- as.numeric(bdc_hombres$IndemnityPaid)
bdc_hombres$OtherPaid <- as.numeric(bdc_hombres$OtherPaid)
bdc_hombres$AverageWeeklyWage <- as.numeric(bdc_hombres$AverageWeeklyWage)
## Warning: NAs introducidos por coerción

Seleccionando las variables para el modelo

regresion <- lm(ClaimID ~ TotalPaid + TotalRecovery + AverageWeeklyWage + ClaimantAge_at_DOI + BillReviewALE + Hospital + Rx, data=bdc_hombres)
summary(regresion)
## 
## Call:
## lm(formula = ClaimID ~ TotalPaid + TotalRecovery + AverageWeeklyWage + 
##     ClaimantAge_at_DOI + BillReviewALE + Hospital + Rx, data = bdc_hombres)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -16765469    -47310     -2049     45050  26900994 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             7.234e+05  1.491e+06   0.485    0.628    
## TotalPaid               2.344e-01  8.521e-01   0.275    0.783    
## TotalRecovery          -2.383e+00  2.725e+01  -0.087    0.930    
## AverageWeeklyWage       2.054e+02  1.305e+02   1.574    0.116    
## ClaimantAge_at_DOI12   -3.022e+04  2.109e+06  -0.014    0.989    
## ClaimantAge_at_DOI18    3.627e+04  2.103e+06   0.017    0.986    
## ClaimantAge_at_DOI19    8.321e+03  1.664e+06   0.005    0.996    
## ClaimantAge_at_DOI2     7.096e+04  2.104e+06   0.034    0.973    
## ClaimantAge_at_DOI20    1.584e+05  1.729e+06   0.092    0.927    
## ClaimantAge_at_DOI21    2.453e+03  1.591e+06   0.002    0.999    
## ClaimantAge_at_DOI22    9.335e+04  1.629e+06   0.057    0.954    
## ClaimantAge_at_DOI23    2.667e+04  1.549e+06   0.017    0.986    
## ClaimantAge_at_DOI24    4.535e+04  1.559e+06   0.029    0.977    
## ClaimantAge_at_DOI25   -1.407e+03  1.608e+06  -0.001    0.999    
## ClaimantAge_at_DOI26    4.376e+04  1.606e+06   0.027    0.978    
## ClaimantAge_at_DOI27   -1.118e+04  1.549e+06  -0.007    0.994    
## ClaimantAge_at_DOI28    2.206e+04  1.549e+06   0.014    0.989    
## ClaimantAge_at_DOI29   -2.183e+04  1.537e+06  -0.014    0.989    
## ClaimantAge_at_DOI30    1.944e+06  1.535e+06   1.267    0.206    
## ClaimantAge_at_DOI31    2.541e+04  1.549e+06   0.016    0.987    
## ClaimantAge_at_DOI32   -8.775e+03  1.523e+06  -0.006    0.995    
## ClaimantAge_at_DOI33   -6.663e+03  1.519e+06  -0.004    0.997    
## ClaimantAge_at_DOI34    2.908e+03  1.537e+06   0.002    0.998    
## ClaimantAge_at_DOI35   -8.883e+02  1.533e+06  -0.001    1.000    
## ClaimantAge_at_DOI36   -2.186e+04  1.521e+06  -0.014    0.989    
## ClaimantAge_at_DOI37   -2.185e+04  1.542e+06  -0.014    0.989    
## ClaimantAge_at_DOI38    2.441e+03  1.606e+06   0.002    0.999    
## ClaimantAge_at_DOI39   -4.026e+04  1.536e+06  -0.026    0.979    
## ClaimantAge_at_DOI40   -1.158e+04  1.534e+06  -0.008    0.994    
## ClaimantAge_at_DOI41    4.583e+03  1.536e+06   0.003    0.998    
## ClaimantAge_at_DOI42   -1.646e+04  1.525e+06  -0.011    0.991    
## ClaimantAge_at_DOI43   -2.816e+04  1.541e+06  -0.018    0.985    
## ClaimantAge_at_DOI44   -2.724e+04  1.541e+06  -0.018    0.986    
## ClaimantAge_at_DOI45   -1.622e+04  1.527e+06  -0.011    0.992    
## ClaimantAge_at_DOI46   -5.115e+04  1.555e+06  -0.033    0.974    
## ClaimantAge_at_DOI47   -1.518e+05  1.551e+06  -0.098    0.922    
## ClaimantAge_at_DOI48   -4.413e+04  1.527e+06  -0.029    0.977    
## ClaimantAge_at_DOI49   -3.729e+04  1.537e+06  -0.024    0.981    
## ClaimantAge_at_DOI50   -6.026e+04  1.524e+06  -0.040    0.968    
## ClaimantAge_at_DOI51   -3.083e+04  1.515e+06  -0.020    0.984    
## ClaimantAge_at_DOI52   -3.121e+04  1.537e+06  -0.020    0.984    
## ClaimantAge_at_DOI53   -2.037e+04  1.521e+06  -0.013    0.989    
## ClaimantAge_at_DOI54   -5.700e+04  1.541e+06  -0.037    0.971    
## ClaimantAge_at_DOI55   -5.157e+04  1.555e+06  -0.033    0.974    
## ClaimantAge_at_DOI56    8.371e+05  1.548e+06   0.541    0.589    
## ClaimantAge_at_DOI57   -4.547e+04  1.561e+06  -0.029    0.977    
## ClaimantAge_at_DOI58   -1.921e+04  1.607e+06  -0.012    0.990    
## ClaimantAge_at_DOI59   -5.115e+04  1.632e+06  -0.031    0.975    
## ClaimantAge_at_DOI60   -6.493e+04  1.573e+06  -0.041    0.967    
## ClaimantAge_at_DOI61    1.697e+06  1.607e+06   1.056    0.291    
## ClaimantAge_at_DOI62    6.806e+03  1.577e+06   0.004    0.997    
## ClaimantAge_at_DOI63    5.024e+04  2.109e+06   0.024    0.981    
## ClaimantAge_at_DOI65   -1.919e+05  1.720e+06  -0.112    0.911    
## ClaimantAge_at_DOI66    2.594e+04  2.110e+06   0.012    0.990    
## ClaimantAge_at_DOI68    1.684e+07  1.822e+06   9.239   <2e-16 ***
## ClaimantAge_at_DOI71    1.398e+04  2.101e+06   0.007    0.995    
## ClaimantAge_at_DOI76   -6.103e+04  2.104e+06  -0.029    0.977    
## ClaimantAge_at_DOI80   -2.315e+04  2.102e+06  -0.011    0.991    
## ClaimantAge_at_DOI82    8.485e+04  2.104e+06   0.040    0.968    
## ClaimantAge_at_DOINULL -1.527e+04  1.492e+06  -0.010    0.992    
## BillReviewALE           9.382e+00  2.863e+01   0.328    0.743    
## Hospital               -5.126e-01  2.117e+00  -0.242    0.809    
## Rx                     -1.580e+00  3.431e+00  -0.461    0.645    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1485000 on 691 degrees of freedom
##   (64371 observations deleted due to missingness)
## Multiple R-squared:  0.2976, Adjusted R-squared:  0.2346 
## F-statistic: 4.722 on 62 and 691 DF,  p-value: < 2.2e-16

Corriendo la predicción

datos <- data.frame(TotalPaid=9295.89, TotalRecovery=0, AverageWeeklyWage=639.59, ClaimantAge_at_DOI="61", BillReviewALE=74.68, Hospital=106.53, Rx=56.52)
predict(regresion,datos)
##       1 
## 2554675