Extraer la Base de Datos

# Load the cleaned claims data set into `bd`.
# NOTE(review): the path below is absolute and machine-specific; on another
# machine, run file.choose() interactively to locate the CSV.
# file.choose()
bd <- read.csv(
  file = "/Users/danielatrevino/Downloads/Claims_limpia.csv"
)
# Per-column overview: numeric ranges, column classes and NA counts.
summary(bd)
##     ClaimID           TotalPaid       TotalReserves     TotalRecovery      
##  Min.   :  650915   Min.   :   -270   Min.   :      0   Min.   :     0.00  
##  1st Qu.:  811125   1st Qu.:     60   1st Qu.:      0   1st Qu.:     0.00  
##  Median :  844626   Median :    235   Median :      0   Median :     0.00  
##  Mean   :10149151   Mean   :   6746   Mean   :   2233   Mean   :    68.88  
##  3rd Qu.:22716506   3rd Qu.:    938   3rd Qu.:      0   3rd Qu.:     0.00  
##  Max.   :62203891   Max.   :4527291   Max.   :2069575   Max.   :130541.03  
##                                                                            
##  IndemnityPaid      OtherPaid       ClaimStatus        IncidentDate      
##  Min.   :  -475   Min.   :  -7820   Length:134004      Length:134004     
##  1st Qu.:     0   1st Qu.:     58   Class :character   Class :character  
##  Median :     0   Median :    230   Mode  :character   Mode  :character  
##  Mean   :  3061   Mean   :   3685                                        
##  3rd Qu.:     0   3rd Qu.:    855                                        
##  Max.   :640732   Max.   :4129915                                        
##                                                                          
##  IncidentDescription AverageWeeklyWage  ReceivedDate          IsDenied      
##  Length:134004       Length:134004      Length:134004      Min.   :0.00000  
##  Class :character    Class :character   Class :character   1st Qu.:0.00000  
##  Mode  :character    Mode  :character   Mode  :character   Median :0.00000  
##                                                            Mean   :0.04474  
##                                                            3rd Qu.:0.00000  
##                                                            Max.   :1.00000  
##                                                                             
##     Gender          ClaimantType       InjuryNature       BodyPartRegion    
##  Length:134004      Length:134004      Length:134004      Length:134004     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    BodyPart             semana      ClaimantClosedDate2 ClaimantOpenedDate2
##  Length:134004      Min.   : 0.00   Min.   : 1.00       Min.   : 1.00      
##  Class :character   1st Qu.:13.00   1st Qu.:13.00       1st Qu.:14.00      
##  Mode  :character   Median :25.00   Median :14.00       Median :26.00      
##                     Mean   :25.62   Mean   :21.55       Mean   :25.95      
##                     3rd Qu.:38.00   3rd Qu.:30.00       3rd Qu.:38.00      
##                     Max.   :53.00   Max.   :53.00       Max.   :53.00      
##                     NA's   :58637   NA's   :4678                           
##  EmployerNotificationDate2 ReturnToWorkDate2 Total_Incurred_Cost_Claim
##  Min.   : 1.00             Min.   : 1.00     Min.   : -11775          
##  1st Qu.:14.00             1st Qu.:14.00     1st Qu.:     59          
##  Median :25.00             Median :26.00     Median :    234          
##  Mean   :25.77             Mean   :26.02     Mean   :   8910          
##  3rd Qu.:38.00             3rd Qu.:38.00     3rd Qu.:    965          
##  Max.   :53.00             Max.   :53.00     Max.   :5054823          
##  NA's   :22288             NA's   :58637

Analizar la Base de Datos

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Number of claims per value of the 0/1 denial flag, largest group first.
bd %>% count(IsDenied, sort = TRUE)
##   IsDenied      n
## 1        0 128008
## 2        1   5996
# Number of claims per Gender category, largest group first.
bd %>% count(Gender, sort = TRUE)
##          Gender     n
## 1          Male 65125
## 2        Female 59197
## 3 Not Available  9682
# Number of claims per ClaimantType category, largest group first.
bd %>% count(ClaimantType, sort = TRUE)
##   ClaimantType     n
## 1 Medical Only 87943
## 2    Indemnity 33726
## 3  Report Only 12335
# Number of claims per BodyPartRegion category, largest group first.
bd %>% count(BodyPartRegion, sort = TRUE)
##        BodyPartRegion     n
## 1   Upper Extremities 38952
## 2   Lower Extremities 27609
## 3               Trunk 21223
## 4 Multiple Body Parts 16380
## 5                Head 12832
## 6   Non-Standard Code 12651
## 7                Neck  4349
## 8       Not Available     8

Extraer las variables de interés

# Keep only the four categorical columns that the decision trees use.
transaction <- bd[c("IsDenied", "Gender", "ClaimantType", "BodyPartRegion")]

Convertir las variables categóricas en factores

# Recode the numeric denial flag into labels (0 -> "Aprobada", any other
# value -> "Denegada") and store the result as a factor.
transaction$IsDenied <- as.factor(
  ifelse(transaction$IsDenied == 0, "Aprobada", "Denegada")
)

# The remaining three columns are read in as character; convert them to
# factors in a single pass.
transaction[c("Gender", "ClaimantType", "BodyPartRegion")] <- lapply(
  transaction[c("Gender", "ClaimantType", "BodyPartRegion")],
  as.factor
)

# Confirm that every column is now a factor and inspect its levels.
str(transaction)
## 'data.frame':    134004 obs. of  4 variables:
##  $ IsDenied      : Factor w/ 2 levels "Aprobada","Denegada": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Gender        : Factor w/ 3 levels "Female","Male",..: 2 2 2 2 1 2 1 2 1 2 ...
##  $ ClaimantType  : Factor w/ 3 levels "Indemnity","Medical Only",..: 1 2 1 2 1 2 2 1 2 1 ...
##  $ BodyPartRegion: Factor w/ 8 levels "Head","Lower Extremities",..: 7 7 8 8 7 2 7 8 3 3 ...
# Level counts for each factor column of the modelling data set.
summary(object = transaction)
##      IsDenied                Gender            ClaimantType  
##  Aprobada:128008   Female       :59197   Indemnity   :33726  
##  Denegada:  5996   Male         :65125   Medical Only:87943  
##                    Not Available: 9682   Report Only :12335  
##                                                              
##                                                              
##                                                              
##                                                              
##              BodyPartRegion 
##  Upper Extremities  :38952  
##  Lower Extremities  :27609  
##  Trunk              :21223  
##  Multiple Body Parts:16380  
##  Head               :12832  
##  Non-Standard Code  :12651  
##  (Other)            : 4357

Crear el árbol de decisión

# install.packages("rpart")
library(rpart)
# install.packages("rpart.plot")
library(rpart.plot)

# Fit a tree with Gender as the response and the other three columns of
# `transaction` as predictors, using rpart's default settings.
arbol <- rpart(Gender ~ ., data = transaction)

# Print the fitted splits as text.
print(arbol)
## n= 134004 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 134004 68879 Male (0.441755470 0.485992955 0.072251575)  
##    2) BodyPartRegion=Head,Lower Extremities,Multiple Body Parts,Neck,Not Available,Trunk,Upper Extremities 121353 59449 Male (0.484001220 0.510115119 0.005883662)  
##      4) BodyPartRegion=Lower Extremities,Multiple Body Parts,Neck,Not Available,Upper Extremities 87298 43365 Female (0.503253225 0.491202548 0.005544228)  
##        8) IsDenied=Denegada 4491  1874 Female (0.582720998 0.413048319 0.004230684) *
##        9) IsDenied=Aprobada 82807 41491 Female (0.498943326 0.495441207 0.005615467)  
##         18) BodyPartRegion=Lower Extremities,Multiple Body Parts 41527 19959 Female (0.519372938 0.476003564 0.004623498) *
##         19) BodyPartRegion=Neck,Not Available,Upper Extremities 41280 20021 Male (0.478391473 0.514995155 0.006613372) *
##      5) BodyPartRegion=Head,Trunk 34055 15032 Male (0.434649831 0.558596388 0.006753781) *
##    3) BodyPartRegion=Non-Standard Code 12651  3683 Not Available (0.036518852 0.254604379 0.708876769) *
# Draw the Gender tree.
rpart.plot(x = arbol)

# Refit with ClaimantType as the response; this overwrites the previous
# `arbol` object.
arbol <- rpart(ClaimantType ~ ., data = transaction)

# Print the fitted splits as text.
print(arbol)
## n= 134004 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 134004 46061 Medical Only (0.25167905 0.65627145 0.09204949)  
##   2) BodyPartRegion=Head,Lower Extremities,Multiple Body Parts,Neck,Trunk,Upper Extremities 121345 35669 Medical Only (0.23339239 0.70605299 0.06055462) *
##   3) BodyPartRegion=Non-Standard Code,Not Available 12659  7254 Indemnity (0.42696895 0.17908208 0.39394897) *
# Draw the ClaimantType tree.
rpart.plot(x = arbol)

# Refit with BodyPartRegion as the response; this overwrites the previous
# `arbol` object.
arbol <- rpart(BodyPartRegion ~ ., data = transaction)

# Print the fitted splits as text.
print(arbol)
## n= 134004 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 134004 95052 Upper Extremities (0.096 0.21 0.12 0.032 0.094 6e-05 0.16 0.29)  
##   2) Gender=Female,Male 124322 85613 Upper Extremities (0.1 0.22 0.13 0.035 0.03 6.4e-05 0.17 0.31) *
##   3) Gender=Not Available 9682   714 Non-Standard Code (0.0096 0.017 0.0033 0.0041 0.93 0 0.014 0.025) *
# Draw the BodyPartRegion tree.
rpart.plot(x = arbol)

# Refit with IsDenied as the response; this overwrites the previous
# `arbol` object.
# NOTE(review): the printed result is a single root node (no splits).
# About 95.5% of claims are "Aprobada", so presumably no split clears
# rpart's default complexity threshold; consider a smaller `cp` or class
# priors/weights if a non-trivial tree is wanted -- TODO confirm.
arbol <- rpart(IsDenied ~ ., data = transaction)

# Print the fitted splits as text.
print(arbol)
## n= 134004 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 134004 5996 Aprobada (0.95525507 0.04474493) *
# Draw the IsDenied tree (a single root node, per the printed result).
rpart.plot(x = arbol)