1. Cargar librarías y base de datos

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
#file.choose()

Claims <- read_excel("/Users/marianaguevara/Downloads/ClaimsData2018.xlsx")

View(Claims)

2. Entender la base de datos

summary(Claims)
##     ClaimID          TotalPaid         TotalReserves      TotalRecovery     
##  Min.   :  650915   Length:134004      Length:134004      Length:134004     
##  1st Qu.:  811125   Class :character   Class :character   Class :character  
##  Median :  844626   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :10149151                                                           
##  3rd Qu.:22716506                                                           
##  Max.   :62203891                                                           
##                                                                             
##  IndemnityPaid       OtherPaid         ClaimStatus       
##  Length:134004      Length:134004      Length:134004     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   IncidentDate                   IncidentDescription
##  Min.   :1947-02-24 00:00:00.0   Length:134004      
##  1st Qu.:1998-12-21 00:00:00.0   Class :character   
##  Median :2004-01-05 00:00:00.0   Mode  :character   
##  Mean   :2003-12-08 16:48:47.2                      
##  3rd Qu.:2009-02-02 00:00:00.0                      
##  Max.   :2014-06-27 00:00:00.0                      
##                                                     
##  ReturnToWorkDate                 AverageWeeklyWage 
##  Min.   :1976-10-29 00:00:00.00   Length:134004     
##  1st Qu.:2002-04-25 00:00:00.00   Class :character  
##  Median :2007-07-09 00:00:00.00   Mode  :character  
##  Mean   :2006-06-01 15:30:12.00                     
##  3rd Qu.:2011-06-01 00:00:00.00                     
##  Max.   :2015-05-07 00:00:00.00                     
##  NA's   :58637                                      
##  ClaimantOpenedDate              ClaimantClosedDate              
##  Min.   :1947-02-24 00:00:00.0   Min.   :1999-06-01 00:00:00.00  
##  1st Qu.:1999-02-09 00:00:00.0   1st Qu.:2005-03-31 00:00:00.00  
##  Median :2004-02-17 00:00:00.0   Median :2006-04-04 12:00:00.00  
##  Mean   :2004-01-23 11:32:07.5   Mean   :2007-05-24 02:11:30.00  
##  3rd Qu.:2009-04-09 06:00:00.0   3rd Qu.:2009-11-11 00:00:00.00  
##  Max.   :2014-06-30 00:00:00.0   Max.   :2014-06-30 00:00:00.00  
##                                  NA's   :4678                    
##  EmployerNotificationDate         ReceivedDate                  
##  Min.   :1972-09-10 00:00:00.0   Min.   :1947-02-24 00:00:00.0  
##  1st Qu.:2000-03-13 00:00:00.0   1st Qu.:1999-02-09 00:00:00.0  
##  Median :2004-12-28 00:00:00.0   Median :2004-02-13 00:00:00.0  
##  Mean   :2005-08-29 04:49:34.2   Mean   :2004-07-19 11:29:03.0  
##  3rd Qu.:2009-11-03 00:00:00.0   3rd Qu.:2009-02-27 00:00:00.0  
##  Max.   :9999-07-21 00:00:00.0   Max.   :9999-07-21 00:00:00.0  
##  NA's   :22288                                                  
##    IsDenied         ClaimantAge_at_DOI    Gender          ClaimantType      
##  Length:134004      Length:134004      Length:134004      Length:134004     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  InjuryNature       BodyPartRegion       BodyPart        
##  Length:134004      Length:134004      Length:134004     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 
head(Claims)
## # A tibble: 6 × 22
##   ClaimID TotalPaid TotalReserves TotalRecovery IndemnityPaid OtherPaid
##     <dbl> <chr>     <chr>         <chr>         <chr>         <chr>    
## 1  650915 11947.55  0.00          0.00          243.65        11703.90 
## 2  650916 0.00      0.00          0.00          0.00          0.00     
## 3  650917 9295.89   0.00          0.00          0.00          9295.89  
## 4  650918 1026.29   0.00          0.00          0.00          1026.29  
## 5  650919 43108.03  0.00          0.00          40000.00      3108.03  
## 6  650920 331.90    0.00          0.00          0.00          331.90   
## # ℹ 16 more variables: ClaimStatus <chr>, IncidentDate <dttm>,
## #   IncidentDescription <chr>, ReturnToWorkDate <dttm>,
## #   AverageWeeklyWage <chr>, ClaimantOpenedDate <dttm>,
## #   ClaimantClosedDate <dttm>, EmployerNotificationDate <dttm>,
## #   ReceivedDate <dttm>, IsDenied <chr>, ClaimantAge_at_DOI <chr>,
## #   Gender <chr>, ClaimantType <chr>, InjuryNature <chr>, BodyPartRegion <chr>,
## #   BodyPart <chr>

3. Limpiar la base de datos

## Elimar columnas 
Claims <- Claims %>%
  select(- ClaimantAge_at_DOI)
#¿Cuantos NA's tengo en la bd?
sum(is.na(Claims))
## [1] 85603
#Cuantos NA's tengo por variable
sapply(Claims, function(x) sum(is.na(x)))
##                  ClaimID                TotalPaid            TotalReserves 
##                        0                        0                        0 
##            TotalRecovery            IndemnityPaid                OtherPaid 
##                        0                        0                        0 
##              ClaimStatus             IncidentDate      IncidentDescription 
##                        0                        0                        0 
##         ReturnToWorkDate        AverageWeeklyWage       ClaimantOpenedDate 
##                    58637                        0                        0 
##       ClaimantClosedDate EmployerNotificationDate             ReceivedDate 
##                     4678                    22288                        0 
##                 IsDenied                   Gender             ClaimantType 
##                        0                        0                        0 
##             InjuryNature           BodyPartRegion                 BodyPart 
##                        0                        0                        0
#Convertir la columna fecha a semanas 
Claims$semana <- as.numeric(strftime(Claims$ClaimantClosedDate, format = "%U"))
Claims$semana <- as.numeric(strftime(Claims$EmployerNotificationDate, format = "%U"))
Claims$semana <- as.numeric(strftime(Claims$ReturnToWorkDate, format = "%U"))

str(Claims)
## tibble [134,004 × 22] (S3: tbl_df/tbl/data.frame)
##  $ ClaimID                 : num [1:134004] 650915 650916 650917 650918 650919 ...
##  $ TotalPaid               : chr [1:134004] "11947.55" "0.00" "9295.89" "1026.29" ...
##  $ TotalReserves           : chr [1:134004] "0.00" "0.00" "0.00" "0.00" ...
##  $ TotalRecovery           : chr [1:134004] "0.00" "0.00" "0.00" "0.00" ...
##  $ IndemnityPaid           : chr [1:134004] "243.65" "0.00" "0.00" "0.00" ...
##  $ OtherPaid               : chr [1:134004] "11703.90" "0.00" "9295.89" "1026.29" ...
##  $ ClaimStatus             : chr [1:134004] "C" "C" "C" "C" ...
##  $ IncidentDate            : POSIXct[1:134004], format: "2009-06-17" "2009-06-26" ...
##  $ IncidentDescription     : chr [1:134004] "Employee was moving concrete rings and installing a meter. He strained lower back." "Employee was pulling lining. He felt a pop in the back causing a strain." "Employee was in the restroom. He heard a scream from another restroom that startled him and he fell on his left"| __truncated__ "Employee was unloading truck using a pallet jack to unload heavy equipment. Heavy load caused him to lose balan"| __truncated__ ...
##  $ ReturnToWorkDate        : POSIXct[1:134004], format: "2009-12-08" "2009-06-26" ...
##  $ AverageWeeklyWage       : chr [1:134004] "639.59" "NULL" "1649.00" "NULL" ...
##  $ ClaimantOpenedDate      : POSIXct[1:134004], format: "2009-07-02" "2009-07-02" ...
##  $ ClaimantClosedDate      : POSIXct[1:134004], format: "2010-07-20" "2009-11-25" ...
##  $ EmployerNotificationDate: POSIXct[1:134004], format: "2009-06-29" "2009-07-01" ...
##  $ ReceivedDate            : POSIXct[1:134004], format: "2009-07-02" "2009-07-02" ...
##  $ IsDenied                : chr [1:134004] "0" "0" "0" "0" ...
##  $ Gender                  : chr [1:134004] "Male" "Male" "Male" "Male" ...
##  $ ClaimantType            : chr [1:134004] "Indemnity" "Medical Only" "Indemnity" "Medical Only" ...
##  $ InjuryNature            : chr [1:134004] "Strain" "Strain" "Fracture" "Contusion" ...
##  $ BodyPartRegion          : chr [1:134004] "Trunk" "Trunk" "Upper Extremities" "Upper Extremities" ...
##  $ BodyPart                : chr [1:134004] "Lower Back Area" "Lower Back Area" "Hand" "Shoulder(S)" ...
##  $ semana                  : num [1:134004] 49 25 28 23 NA 25 26 25 NA NA ...
# Crear una nueva columna llamada Numero_de_Semana
Claims$ClaimantClosedDate2 <- week(Claims$ClaimantClosedDate)
Claims$ClaimantOpenedDate2 <- week(Claims$ClaimantOpenedDate)
Claims$EmployerNotificationDate2 <- week(Claims$EmployerNotificationDate)
Claims$ReturnToWorkDate2 <- week(Claims$ReturnToWorkDate)

## Elimar columnas 
Claims <- Claims %>%
  select(- ClaimantClosedDate, -ClaimantOpenedDate, -EmployerNotificationDate, -ReturnToWorkDate)

str(Claims)
## tibble [134,004 × 22] (S3: tbl_df/tbl/data.frame)
##  $ ClaimID                  : num [1:134004] 650915 650916 650917 650918 650919 ...
##  $ TotalPaid                : chr [1:134004] "11947.55" "0.00" "9295.89" "1026.29" ...
##  $ TotalReserves            : chr [1:134004] "0.00" "0.00" "0.00" "0.00" ...
##  $ TotalRecovery            : chr [1:134004] "0.00" "0.00" "0.00" "0.00" ...
##  $ IndemnityPaid            : chr [1:134004] "243.65" "0.00" "0.00" "0.00" ...
##  $ OtherPaid                : chr [1:134004] "11703.90" "0.00" "9295.89" "1026.29" ...
##  $ ClaimStatus              : chr [1:134004] "C" "C" "C" "C" ...
##  $ IncidentDate             : POSIXct[1:134004], format: "2009-06-17" "2009-06-26" ...
##  $ IncidentDescription      : chr [1:134004] "Employee was moving concrete rings and installing a meter. He strained lower back." "Employee was pulling lining. He felt a pop in the back causing a strain." "Employee was in the restroom. He heard a scream from another restroom that startled him and he fell on his left"| __truncated__ "Employee was unloading truck using a pallet jack to unload heavy equipment. Heavy load caused him to lose balan"| __truncated__ ...
##  $ AverageWeeklyWage        : chr [1:134004] "639.59" "NULL" "1649.00" "NULL" ...
##  $ ReceivedDate             : POSIXct[1:134004], format: "2009-07-02" "2009-07-02" ...
##  $ IsDenied                 : chr [1:134004] "0" "0" "0" "0" ...
##  $ Gender                   : chr [1:134004] "Male" "Male" "Male" "Male" ...
##  $ ClaimantType             : chr [1:134004] "Indemnity" "Medical Only" "Indemnity" "Medical Only" ...
##  $ InjuryNature             : chr [1:134004] "Strain" "Strain" "Fracture" "Contusion" ...
##  $ BodyPartRegion           : chr [1:134004] "Trunk" "Trunk" "Upper Extremities" "Upper Extremities" ...
##  $ BodyPart                 : chr [1:134004] "Lower Back Area" "Lower Back Area" "Hand" "Shoulder(S)" ...
##  $ semana                   : num [1:134004] 49 25 28 23 NA 25 26 25 NA NA ...
##  $ ClaimantClosedDate2      : num [1:134004] 29 47 13 13 18 47 47 8 13 13 ...
##  $ ClaimantOpenedDate2      : num [1:134004] 27 27 27 27 27 27 27 27 30 29 ...
##  $ EmployerNotificationDate2: num [1:134004] 26 26 26 25 26 26 26 26 27 26 ...
##  $ ReturnToWorkDate2        : num [1:134004] 49 26 28 24 NA 26 26 26 NA NA ...

4. Hacer la formula Total Incurred Cost per Claim

Claims$TotalPaid <- as.numeric(Claims$TotalPaid)
Claims$TotalReserves <- as.numeric(Claims$TotalReserves)
Claims$TotalRecovery <- as.numeric(Claims$TotalRecovery)


Claims <- transform(Claims, Total_Incurred_Cost_Claim = TotalReserves + TotalPaid - TotalRecovery)

summary(Claims)
##     ClaimID           TotalPaid       TotalReserves     TotalRecovery      
##  Min.   :  650915   Min.   :   -270   Min.   :      0   Min.   :     0.00  
##  1st Qu.:  811125   1st Qu.:     60   1st Qu.:      0   1st Qu.:     0.00  
##  Median :  844626   Median :    235   Median :      0   Median :     0.00  
##  Mean   :10149151   Mean   :   6746   Mean   :   2233   Mean   :    68.88  
##  3rd Qu.:22716506   3rd Qu.:    938   3rd Qu.:      0   3rd Qu.:     0.00  
##  Max.   :62203891   Max.   :4527291   Max.   :2069575   Max.   :130541.03  
##                                                                            
##  IndemnityPaid       OtherPaid         ClaimStatus       
##  Length:134004      Length:134004      Length:134004     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   IncidentDate                   IncidentDescription AverageWeeklyWage 
##  Min.   :1947-02-24 00:00:00.0   Length:134004       Length:134004     
##  1st Qu.:1998-12-21 00:00:00.0   Class :character    Class :character  
##  Median :2004-01-05 00:00:00.0   Mode  :character    Mode  :character  
##  Mean   :2003-12-08 16:48:47.2                                         
##  3rd Qu.:2009-02-02 00:00:00.0                                         
##  Max.   :2014-06-27 00:00:00.0                                         
##                                                                        
##   ReceivedDate                     IsDenied            Gender         
##  Min.   :1947-02-24 00:00:00.0   Length:134004      Length:134004     
##  1st Qu.:1999-02-09 00:00:00.0   Class :character   Class :character  
##  Median :2004-02-13 00:00:00.0   Mode  :character   Mode  :character  
##  Mean   :2004-07-19 11:29:03.0                                        
##  3rd Qu.:2009-02-27 00:00:00.0                                        
##  Max.   :9999-07-21 00:00:00.0                                        
##                                                                       
##  ClaimantType       InjuryNature       BodyPartRegion       BodyPart        
##  Length:134004      Length:134004      Length:134004      Length:134004     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      semana      ClaimantClosedDate2 ClaimantOpenedDate2
##  Min.   : 0.00   Min.   : 1.00       Min.   : 1.00      
##  1st Qu.:13.00   1st Qu.:13.00       1st Qu.:14.00      
##  Median :25.00   Median :14.00       Median :26.00      
##  Mean   :25.62   Mean   :21.55       Mean   :25.95      
##  3rd Qu.:38.00   3rd Qu.:30.00       3rd Qu.:38.00      
##  Max.   :53.00   Max.   :53.00       Max.   :53.00      
##  NA's   :58637   NA's   :4678                           
##  EmployerNotificationDate2 ReturnToWorkDate2 Total_Incurred_Cost_Claim
##  Min.   : 1.00             Min.   : 1.00     Min.   : -11775          
##  1st Qu.:14.00             1st Qu.:14.00     1st Qu.:     59          
##  Median :25.00             Median :26.00     Median :    234          
##  Mean   :25.77             Mean   :26.02     Mean   :   8910          
##  3rd Qu.:38.00             3rd Qu.:38.00     3rd Qu.:    965          
##  Max.   :53.00             Max.   :53.00     Max.   :5054823          
##  NA's   :22288             NA's   :58637
write.csv(Claims, file="Claims_limpia.csv", row.names = FALSE)