1. Cargar librarías y base de datos
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
#file.choose()
Claims <- read_excel("/Users/marianaguevara/Downloads/ClaimsData2018.xlsx")
View(Claims)
2. Entender la base de datos
summary(Claims)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Length:134004 Length:134004 Length:134004
## 1st Qu.: 811125 Class :character Class :character Class :character
## Median : 844626 Mode :character Mode :character Mode :character
## Mean :10149151
## 3rd Qu.:22716506
## Max. :62203891
##
## IndemnityPaid OtherPaid ClaimStatus
## Length:134004 Length:134004 Length:134004
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## IncidentDate IncidentDescription
## Min. :1947-02-24 00:00:00.0 Length:134004
## 1st Qu.:1998-12-21 00:00:00.0 Class :character
## Median :2004-01-05 00:00:00.0 Mode :character
## Mean :2003-12-08 16:48:47.2
## 3rd Qu.:2009-02-02 00:00:00.0
## Max. :2014-06-27 00:00:00.0
##
## ReturnToWorkDate AverageWeeklyWage
## Min. :1976-10-29 00:00:00.00 Length:134004
## 1st Qu.:2002-04-25 00:00:00.00 Class :character
## Median :2007-07-09 00:00:00.00 Mode :character
## Mean :2006-06-01 15:30:12.00
## 3rd Qu.:2011-06-01 00:00:00.00
## Max. :2015-05-07 00:00:00.00
## NA's :58637
## ClaimantOpenedDate ClaimantClosedDate
## Min. :1947-02-24 00:00:00.0 Min. :1999-06-01 00:00:00.00
## 1st Qu.:1999-02-09 00:00:00.0 1st Qu.:2005-03-31 00:00:00.00
## Median :2004-02-17 00:00:00.0 Median :2006-04-04 12:00:00.00
## Mean :2004-01-23 11:32:07.5 Mean :2007-05-24 02:11:30.00
## 3rd Qu.:2009-04-09 06:00:00.0 3rd Qu.:2009-11-11 00:00:00.00
## Max. :2014-06-30 00:00:00.0 Max. :2014-06-30 00:00:00.00
## NA's :4678
## EmployerNotificationDate ReceivedDate
## Min. :1972-09-10 00:00:00.0 Min. :1947-02-24 00:00:00.0
## 1st Qu.:2000-03-13 00:00:00.0 1st Qu.:1999-02-09 00:00:00.0
## Median :2004-12-28 00:00:00.0 Median :2004-02-13 00:00:00.0
## Mean :2005-08-29 04:49:34.2 Mean :2004-07-19 11:29:03.0
## 3rd Qu.:2009-11-03 00:00:00.0 3rd Qu.:2009-02-27 00:00:00.0
## Max. :9999-07-21 00:00:00.0 Max. :9999-07-21 00:00:00.0
## NA's :22288
## IsDenied ClaimantAge_at_DOI Gender ClaimantType
## Length:134004 Length:134004 Length:134004 Length:134004
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## InjuryNature BodyPartRegion BodyPart
## Length:134004 Length:134004 Length:134004
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
head(Claims)
## # A tibble: 6 × 22
## ClaimID TotalPaid TotalReserves TotalRecovery IndemnityPaid OtherPaid
## <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 650915 11947.55 0.00 0.00 243.65 11703.90
## 2 650916 0.00 0.00 0.00 0.00 0.00
## 3 650917 9295.89 0.00 0.00 0.00 9295.89
## 4 650918 1026.29 0.00 0.00 0.00 1026.29
## 5 650919 43108.03 0.00 0.00 40000.00 3108.03
## 6 650920 331.90 0.00 0.00 0.00 331.90
## # ℹ 16 more variables: ClaimStatus <chr>, IncidentDate <dttm>,
## # IncidentDescription <chr>, ReturnToWorkDate <dttm>,
## # AverageWeeklyWage <chr>, ClaimantOpenedDate <dttm>,
## # ClaimantClosedDate <dttm>, EmployerNotificationDate <dttm>,
## # ReceivedDate <dttm>, IsDenied <chr>, ClaimantAge_at_DOI <chr>,
## # Gender <chr>, ClaimantType <chr>, InjuryNature <chr>, BodyPartRegion <chr>,
## # BodyPart <chr>
3. Limpiar la base de datos
## Elimar columnas
Claims <- Claims %>%
select(- ClaimantAge_at_DOI)
#¿Cuantos NA's tengo en la bd?
sum(is.na(Claims))
## [1] 85603
#Cuantos NA's tengo por variable
sapply(Claims, function(x) sum(is.na(x)))
## ClaimID TotalPaid TotalReserves
## 0 0 0
## TotalRecovery IndemnityPaid OtherPaid
## 0 0 0
## ClaimStatus IncidentDate IncidentDescription
## 0 0 0
## ReturnToWorkDate AverageWeeklyWage ClaimantOpenedDate
## 58637 0 0
## ClaimantClosedDate EmployerNotificationDate ReceivedDate
## 4678 22288 0
## IsDenied Gender ClaimantType
## 0 0 0
## InjuryNature BodyPartRegion BodyPart
## 0 0 0
#Convertir la columna fecha a semanas
Claims$semana <- as.numeric(strftime(Claims$ClaimantClosedDate, format = "%U"))
Claims$semana <- as.numeric(strftime(Claims$EmployerNotificationDate, format = "%U"))
Claims$semana <- as.numeric(strftime(Claims$ReturnToWorkDate, format = "%U"))
str(Claims)
## tibble [134,004 × 22] (S3: tbl_df/tbl/data.frame)
## $ ClaimID : num [1:134004] 650915 650916 650917 650918 650919 ...
## $ TotalPaid : chr [1:134004] "11947.55" "0.00" "9295.89" "1026.29" ...
## $ TotalReserves : chr [1:134004] "0.00" "0.00" "0.00" "0.00" ...
## $ TotalRecovery : chr [1:134004] "0.00" "0.00" "0.00" "0.00" ...
## $ IndemnityPaid : chr [1:134004] "243.65" "0.00" "0.00" "0.00" ...
## $ OtherPaid : chr [1:134004] "11703.90" "0.00" "9295.89" "1026.29" ...
## $ ClaimStatus : chr [1:134004] "C" "C" "C" "C" ...
## $ IncidentDate : POSIXct[1:134004], format: "2009-06-17" "2009-06-26" ...
## $ IncidentDescription : chr [1:134004] "Employee was moving concrete rings and installing a meter. He strained lower back." "Employee was pulling lining. He felt a pop in the back causing a strain." "Employee was in the restroom. He heard a scream from another restroom that startled him and he fell on his left"| __truncated__ "Employee was unloading truck using a pallet jack to unload heavy equipment. Heavy load caused him to lose balan"| __truncated__ ...
## $ ReturnToWorkDate : POSIXct[1:134004], format: "2009-12-08" "2009-06-26" ...
## $ AverageWeeklyWage : chr [1:134004] "639.59" "NULL" "1649.00" "NULL" ...
## $ ClaimantOpenedDate : POSIXct[1:134004], format: "2009-07-02" "2009-07-02" ...
## $ ClaimantClosedDate : POSIXct[1:134004], format: "2010-07-20" "2009-11-25" ...
## $ EmployerNotificationDate: POSIXct[1:134004], format: "2009-06-29" "2009-07-01" ...
## $ ReceivedDate : POSIXct[1:134004], format: "2009-07-02" "2009-07-02" ...
## $ IsDenied : chr [1:134004] "0" "0" "0" "0" ...
## $ Gender : chr [1:134004] "Male" "Male" "Male" "Male" ...
## $ ClaimantType : chr [1:134004] "Indemnity" "Medical Only" "Indemnity" "Medical Only" ...
## $ InjuryNature : chr [1:134004] "Strain" "Strain" "Fracture" "Contusion" ...
## $ BodyPartRegion : chr [1:134004] "Trunk" "Trunk" "Upper Extremities" "Upper Extremities" ...
## $ BodyPart : chr [1:134004] "Lower Back Area" "Lower Back Area" "Hand" "Shoulder(S)" ...
## $ semana : num [1:134004] 49 25 28 23 NA 25 26 25 NA NA ...
# Crear una nueva columna llamada Numero_de_Semana
Claims$ClaimantClosedDate2 <- week(Claims$ClaimantClosedDate)
Claims$ClaimantOpenedDate2 <- week(Claims$ClaimantOpenedDate)
Claims$EmployerNotificationDate2 <- week(Claims$EmployerNotificationDate)
Claims$ReturnToWorkDate2 <- week(Claims$ReturnToWorkDate)
## Elimar columnas
Claims <- Claims %>%
select(- ClaimantClosedDate, -ClaimantOpenedDate, -EmployerNotificationDate, -ReturnToWorkDate)
str(Claims)
## tibble [134,004 × 22] (S3: tbl_df/tbl/data.frame)
## $ ClaimID : num [1:134004] 650915 650916 650917 650918 650919 ...
## $ TotalPaid : chr [1:134004] "11947.55" "0.00" "9295.89" "1026.29" ...
## $ TotalReserves : chr [1:134004] "0.00" "0.00" "0.00" "0.00" ...
## $ TotalRecovery : chr [1:134004] "0.00" "0.00" "0.00" "0.00" ...
## $ IndemnityPaid : chr [1:134004] "243.65" "0.00" "0.00" "0.00" ...
## $ OtherPaid : chr [1:134004] "11703.90" "0.00" "9295.89" "1026.29" ...
## $ ClaimStatus : chr [1:134004] "C" "C" "C" "C" ...
## $ IncidentDate : POSIXct[1:134004], format: "2009-06-17" "2009-06-26" ...
## $ IncidentDescription : chr [1:134004] "Employee was moving concrete rings and installing a meter. He strained lower back." "Employee was pulling lining. He felt a pop in the back causing a strain." "Employee was in the restroom. He heard a scream from another restroom that startled him and he fell on his left"| __truncated__ "Employee was unloading truck using a pallet jack to unload heavy equipment. Heavy load caused him to lose balan"| __truncated__ ...
## $ AverageWeeklyWage : chr [1:134004] "639.59" "NULL" "1649.00" "NULL" ...
## $ ReceivedDate : POSIXct[1:134004], format: "2009-07-02" "2009-07-02" ...
## $ IsDenied : chr [1:134004] "0" "0" "0" "0" ...
## $ Gender : chr [1:134004] "Male" "Male" "Male" "Male" ...
## $ ClaimantType : chr [1:134004] "Indemnity" "Medical Only" "Indemnity" "Medical Only" ...
## $ InjuryNature : chr [1:134004] "Strain" "Strain" "Fracture" "Contusion" ...
## $ BodyPartRegion : chr [1:134004] "Trunk" "Trunk" "Upper Extremities" "Upper Extremities" ...
## $ BodyPart : chr [1:134004] "Lower Back Area" "Lower Back Area" "Hand" "Shoulder(S)" ...
## $ semana : num [1:134004] 49 25 28 23 NA 25 26 25 NA NA ...
## $ ClaimantClosedDate2 : num [1:134004] 29 47 13 13 18 47 47 8 13 13 ...
## $ ClaimantOpenedDate2 : num [1:134004] 27 27 27 27 27 27 27 27 30 29 ...
## $ EmployerNotificationDate2: num [1:134004] 26 26 26 25 26 26 26 26 27 26 ...
## $ ReturnToWorkDate2 : num [1:134004] 49 26 28 24 NA 26 26 26 NA NA ...