# Importar las bibliotecas necesarias para el análisis de datos
library(readxl) # Para leer archivos Excel
library(dplyr) # Para manipular los datos
library(stringr) # Para trabajar con cadenas de texto
library(dplyr)
excel_claims <- read_excel("/Users/sarahyzayas/Library/Mobile Documents/com~apple~CloudDocs/1. TEC /7 sem/M4_Raul /Act 4.6/ClaimsData2018.xlsx")
archivo_csv <- "Claims.csv" # Ruta del archivo CSV de salida
write.csv(excel_claims, file = archivo_csv, row.names = FALSE)
NOTA: se usó este método para evitar errores en el formato de fecha.
claims <- read.csv("Claims.csv")
transactions <- read.csv("/Users/sarahyzayas/Library/Mobile Documents/com~apple~CloudDocs/1. TEC /7 sem/M4_Raul /Act 4.6/TransactionsSummary2018.csv")
completa <- merge(claims, transactions, by = "ClaimID", all = TRUE)
str(completa)
## 'data.frame': 186677 obs. of 26 variables:
## $ ClaimID : num 633915 633917 633918 633919 633920 ...
## $ TotalPaid : num NA NA NA NA NA ...
## $ TotalReserves : num NA NA NA NA NA NA 0 0 0 0 ...
## $ TotalRecovery : num NA NA NA NA NA NA 0 0 0 0 ...
## $ IndemnityPaid : num NA NA NA NA NA ...
## $ OtherPaid : num NA NA NA NA NA ...
## $ ClaimStatus : chr NA NA NA NA ...
## $ IncidentDate : chr NA NA NA NA ...
## $ IncidentDescription : chr NA NA NA NA ...
## $ ReturnToWorkDate : chr NA NA NA NA ...
## $ AverageWeeklyWage : chr NA NA NA NA ...
## $ ClaimantOpenedDate : chr NA NA NA NA ...
## $ ClaimantClosedDate : chr NA NA NA NA ...
## $ EmployerNotificationDate: chr NA NA NA NA ...
## $ ReceivedDate : chr NA NA NA NA ...
## $ IsDenied : int NA NA NA NA NA NA 0 0 0 0 ...
## $ ClaimantAge_at_DOI : chr NA NA NA NA ...
## $ Gender : chr NA NA NA NA ...
## $ ClaimantType : chr NA NA NA NA ...
## $ InjuryNature : chr NA NA NA NA ...
## $ BodyPartRegion : chr NA NA NA NA ...
## $ BodyPart : chr NA NA NA NA ...
## $ BillReviewALE : num 437.6 311.3 74.7 231.3 24.8 ...
## $ Hospital : num 2035 6540 721 107 NA ...
## $ PhysicianOutpatient : num 4742 2247 231 1078 251 ...
## $ Rx : num 65 0 NA 1629.2 56.5 ...
# numerico
completa$AverageWeeklyWage <- as.numeric(completa$AverageWeeklyWage)
completa$ClaimantAge_at_DOI <- as.numeric(completa$ClaimantAge_at_DOI)
# factor (binario)
completa$IsDenied <- as.factor(completa$IsDenied)
# fecha
columnas_fecha <- grep("date", names(completa), ignore.case = TRUE)
convertir_fecha <- function(col) {
formatos <- c("%m/%d/%Y", "%Y-%m-%d", "%d-%m-%Y", "%m-%d-%Y")
for (formato in formatos) {
fecha <- as.Date(col, format = formato)
if (!all(is.na(fecha))) {
return(fecha)
}
}
warning(paste("No se pudo convertir la columna", deparse(substitute(col)), "a fecha"))
return(col)
}
completa[, columnas_fecha] <- lapply(completa[, columnas_fecha], convertir_fecha)
str(completa)
## 'data.frame': 186677 obs. of 26 variables:
## $ ClaimID : num 633915 633917 633918 633919 633920 ...
## $ TotalPaid : num NA NA NA NA NA ...
## $ TotalReserves : num NA NA NA NA NA NA 0 0 0 0 ...
## $ TotalRecovery : num NA NA NA NA NA NA 0 0 0 0 ...
## $ IndemnityPaid : num NA NA NA NA NA ...
## $ OtherPaid : num NA NA NA NA NA ...
## $ ClaimStatus : chr NA NA NA NA ...
## $ IncidentDate : Date, format: NA NA ...
## $ IncidentDescription : chr NA NA NA NA ...
## $ ReturnToWorkDate : Date, format: NA NA ...
## $ AverageWeeklyWage : num NA NA NA NA NA ...
## $ ClaimantOpenedDate : Date, format: NA NA ...
## $ ClaimantClosedDate : Date, format: NA NA ...
## $ EmployerNotificationDate: Date, format: NA NA ...
## $ ReceivedDate : Date, format: NA NA ...
## $ IsDenied : Factor w/ 2 levels "0","1": NA NA NA NA NA NA 1 1 1 1 ...
## $ ClaimantAge_at_DOI : num NA NA NA NA NA NA 49 49 47 61 ...
## $ Gender : chr NA NA NA NA ...
## $ ClaimantType : chr NA NA NA NA ...
## $ InjuryNature : chr NA NA NA NA ...
## $ BodyPartRegion : chr NA NA NA NA ...
## $ BodyPart : chr NA NA NA NA ...
## $ BillReviewALE : num 437.6 311.3 74.7 231.3 24.8 ...
## $ Hospital : num 2035 6540 721 107 NA ...
## $ PhysicianOutpatient : num 4742 2247 231 1078 251 ...
## $ Rx : num 65 0 NA 1629.2 56.5 ...
Todas las variables ya tienen el tipo de dato deseado.
completa$Total_Incurred_Cost_Claim <- completa$TotalReserves +
completa$IndemnityPaid +
completa$OtherPaid -
completa$TotalRecovery
completa$Total_Paid_calc <- completa$IndemnityPaid +
completa$OtherPaid
completa$tiempo_de_procesamiento_days <- as.numeric(difftime(completa$ClaimantClosedDate, completa$ClaimantOpenedDate, units = "days"))
class(completa$Total_Incurred_Cost_Claim)
## [1] "numeric"
class(completa$Total_Paid_calc)
## [1] "numeric"
class(completa$tiempo_de_procesamiento_days)
## [1] "numeric"
OBSERVACIONES: tienen el tipo de dato adecuado.
summary(completa)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 633915 Min. : -270 Min. : 0 Min. : 0.00
## 1st Qu.: 810246 1st Qu.: 60 1st Qu.: 0 1st Qu.: 0.00
## Median : 856915 Median : 235 Median : 0 Median : 0.00
## Mean :12344572 Mean : 6746 Mean : 2233 Mean : 68.88
## 3rd Qu.:22716420 3rd Qu.: 938 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62246496 Max. :4527291 Max. :2069575 Max. :130541.03
## NA's :52673 NA's :52673 NA's :52673
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : -475 Min. : -7820 Length:186677 Min. :1947-02-24
## 1st Qu.: 0 1st Qu.: 58 Class :character 1st Qu.:1998-12-21
## Median : 0 Median : 230 Mode :character Median :2004-01-05
## Mean : 3061 Mean : 3685 Mean :2003-12-08
## 3rd Qu.: 0 3rd Qu.: 855 3rd Qu.:2009-02-02
## Max. :640732 Max. :4129915 Max. :2014-06-27
## NA's :52673 NA's :52673 NA's :52673
## IncidentDescription ReturnToWorkDate AverageWeeklyWage
## Length:186677 Min. :1976-10-29 Min. : 0.0
## Class :character 1st Qu.:2002-04-25 1st Qu.: 300.0
## Mode :character Median :2007-07-09 Median : 492.0
## Mean :2006-06-01 Mean : 587.3
## 3rd Qu.:2011-06-01 3rd Qu.: 660.4
## Max. :2015-05-07 Max. :2024000.0
## NA's :111310 NA's :137597
## ClaimantOpenedDate ClaimantClosedDate EmployerNotificationDate
## Min. :1947-02-24 Min. :1999-06-01 Min. :1972-09-10
## 1st Qu.:1999-02-09 1st Qu.:2005-03-31 1st Qu.:2000-03-13
## Median :2004-02-17 Median :2006-04-04 Median :2004-12-28
## Mean :2004-01-23 Mean :2007-05-24 Mean :2005-08-29
## 3rd Qu.:2009-04-09 3rd Qu.:2009-11-11 3rd Qu.:2009-11-03
## Max. :2014-06-30 Max. :2014-06-30 Max. :9999-07-21
## NA's :52673 NA's :57351 NA's :74961
## ReceivedDate IsDenied ClaimantAge_at_DOI Gender
## Min. :1947-02-24 0 :128008 Min. :-8000.00 Length:186677
## 1st Qu.:1999-02-09 1 : 5996 1st Qu.: 33.00 Class :character
## Median :2004-02-13 NA's: 52673 Median : 42.00 Mode :character
## Mean :2004-07-19 Mean : 39.85
## 3rd Qu.:2009-02-27 3rd Qu.: 51.00
## Max. :9999-07-21 Max. : 94.00
## NA's :52673 NA's :97751
## ClaimantType InjuryNature BodyPartRegion BodyPart
## Length:186677 Length:186677 Length:186677 Length:186677
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BillReviewALE Hospital PhysicianOutpatient Rx
## Min. : -456.0 Min. : -12570.4 Min. : -4655.7 Min. : -469.5
## 1st Qu.: 16.0 1st Qu.: 193.9 1st Qu.: 107.6 1st Qu.: 23.3
## Median : 32.0 Median : 559.1 Median : 221.6 Median : 58.3
## Mean : 191.2 Mean : 4394.7 Mean : 1752.3 Mean : 1140.4
## 3rd Qu.: 80.0 3rd Qu.: 2253.4 3rd Qu.: 710.5 3rd Qu.: 174.5
## Max. :56475.3 Max. :2759604.0 Max. :1481468.5 Max. :631635.5
## NA's :139865 NA's :145262 NA's :84986 NA's :145752
## Total_Incurred_Cost_Claim Total_Paid_calc tiempo_de_procesamiento_days
## Min. : -11775 Min. : -270 Min. : 0.0
## 1st Qu.: 59 1st Qu.: 60 1st Qu.: 161.0
## Median : 234 Median : 235 Median : 992.5
## Mean : 8910 Mean : 6746 Mean : 1318.1
## 3rd Qu.: 965 3rd Qu.: 938 3rd Qu.: 2143.0
## Max. :5054823 Max. :4527291 Max. :19177.0
## NA's :52673 NA's :52673 NA's :57351
Si existen, por lo que vamos a eliminarlos en el siguiente paso.
options(scipen = 999)
completa <- completa %>% mutate(across(where(is.numeric), ~ round(., 2)))
eliminar_filas_negativas <- function(df, columnas) {
for (columna in columnas) {
# Elimina filas donde la columna tiene un valor negativo
indices <- which(df[[columna]] < 0)
if (length(indices) > 0) {
df <- df[-indices, ]
}
}
return(df)
}
# Lista de columnas a iterar
columnas <- c('TotalPaid', 'TotalReserves', 'TotalRecovery', 'IndemnityPaid', 'OtherPaid', 'AverageWeeklyWage', 'ClaimantAge_at_DOI', 'BillReviewALE', 'Hospital','PhysicianOutpatient', 'Rx', 'Total_Incurred_Cost_Claim', 'Total_Paid_calc', 'tiempo_de_procesamiento_days')
# Llamada a la función
completa <- eliminar_filas_negativas(completa, columnas)
porcentaje_na_por_variable <- colSums(is.na(completa)) / nrow(completa) * 100
porcentaje_na_por_variable
## ClaimID TotalPaid
## 0.00000 28.24366
## TotalReserves TotalRecovery
## 28.24366 28.24366
## IndemnityPaid OtherPaid
## 28.24366 28.24366
## ClaimStatus IncidentDate
## 28.24366 28.24366
## IncidentDescription ReturnToWorkDate
## 28.24366 59.63230
## AverageWeeklyWage ClaimantOpenedDate
## 73.72570 28.24366
## ClaimantClosedDate EmployerNotificationDate
## 30.75203 40.17838
## ReceivedDate IsDenied
## 28.24366 28.24366
## ClaimantAge_at_DOI Gender
## 52.39199 28.24366
## ClaimantType InjuryNature
## 28.24366 28.24366
## BodyPartRegion BodyPart
## 28.24366 28.24366
## BillReviewALE Hospital
## 74.92331 77.82533
## PhysicianOutpatient Rx
## 45.52441 78.08276
## Total_Incurred_Cost_Claim Total_Paid_calc
## 28.24366 28.24366
## tiempo_de_procesamiento_days
## 30.75203
boxplot(completa$TotalPaid, main="Boxplot de TotalPaid", horizontal = TRUE)
boxplot(completa$TotalReserves, main="Boxplot de Total Reserves", horizontal = TRUE)
boxplot(completa$TotalRecovery, main="Boxplot de Total Recovery", horizontal = TRUE)
boxplot(completa$IndemnityPaid, main="Boxplot Indemnity Paid", horizontal = TRUE)
boxplot(completa$OtherPaid, main="Boxplot de Other Paid", horizontal = TRUE)
boxplot(completa$AverageWeeklyWage, main="AverageWeeklyWage", horizontal = TRUE)
boxplot(completa$ClaimantAge_at_DOI, main="Boxplot de ClimantAge_at_DOI", horizontal = TRUE)
boxplot(completa$Total_Paid_calc, main="Boxplot de TotalPaid_calc", horizontal = TRUE)
boxplot(completa$Total_Incurred_Cost_Claim, main="Boxplot de Total Incurred Cost Claim", horizontal = TRUE)
boxplot(completa$tiempo_de_procesamiento_days, main="Boxplot de tiempo de procesamiento days", horizontal = TRUE)
OBSERVACIONES: todas las variables tienen dispersiones muy altas, a
excepción de “Climant Age at DOI.
#Los eliminamos solo en las variables que necesitaremos para los clusters
df_clusters <- completa[, c("Total_Paid_calc", "Total_Incurred_Cost_Claim", "tiempo_de_procesamiento_days")]
# nombre de las variables que necesitan quitar outliers
columnas <- c('Total_Incurred_Cost_Claim', 'Total_Paid_calc','tiempo_de_procesamiento_days')
#bucle para eliminar valores arriba del limite superior
for (col in columnas) {
Q3 <- quantile(df_clusters[, col], 0.75, na.rm = TRUE)
IQR_col <- IQR(df_clusters[, col], na.rm = TRUE)
LS <- Q3 + 1.5 * IQR_col
df_clusters <- df_clusters[is.na(df_clusters[, col]) | df_clusters[, col] <= LS, ]
}
summary(df_clusters)
## Total_Paid_calc Total_Incurred_Cost_Claim tiempo_de_procesamiento_days
## Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 129
## Median : 142.2 Median : 141.8 Median :1007
## Mean : 215.0 Mean : 221.9 Mean :1322
## 3rd Qu.: 313.9 3rd Qu.: 316.2 3rd Qu.:2222
## Max. :1023.3 Max. :2300.0 Max. :5355
## NA's :52662 NA's :52662 NA's :53577
NOTA: La técnica de eliminación de outliers utilizó los límites superior e inferior basados en el rango intercuartil (IQR) solo para las variables que se incluirán en el análisis de clusters ya que de hacerlo para todas las vairables numericas, estaríamos perdiendo mucha informción.
summary(completa)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 633915 Min. : 0 Min. : 0 Min. : 0.00
## 1st Qu.: 810280 1st Qu.: 60 1st Qu.: 0 1st Qu.: 0.00
## Median : 856982 Median : 235 Median : 0 Median : 0.00
## Mean :12353268 Mean : 6750 Mean : 2236 Mean : 66.83
## 3rd Qu.:22716452 3rd Qu.: 937 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62246496 Max. :4527291 Max. :2069575 Max. :130541.03
## NA's :52662 NA's :52662 NA's :52662
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : 0 Min. : 0 Length:186456 Min. :1947-02-24
## 1st Qu.: 0 1st Qu.: 58 Class :character 1st Qu.:1998-12-21
## Median : 0 Median : 230 Mode :character Median :2004-01-07
## Mean : 3062 Mean : 3687 Mean :2003-12-09
## 3rd Qu.: 0 3rd Qu.: 855 3rd Qu.:2009-02-05
## Max. :640732 Max. :4129915 Max. :2014-06-27
## NA's :52662 NA's :52662 NA's :52662
## IncidentDescription ReturnToWorkDate AverageWeeklyWage
## Length:186456 Min. :1976-10-29 Min. : 0.0
## Class :character 1st Qu.:2002-04-26 1st Qu.: 300.0
## Mode :character Median :2007-07-10 Median : 491.8
## Mean :2006-06-02 Mean : 587.5
## 3rd Qu.:2011-06-01 3rd Qu.: 660.4
## Max. :2015-05-07 Max. :2024000.0
## NA's :111188 NA's :137466
## ClaimantOpenedDate ClaimantClosedDate EmployerNotificationDate
## Min. :1947-02-24 Min. :1999-06-01 Min. :1972-09-10
## 1st Qu.:1999-02-09 1st Qu.:2005-03-31 1st Qu.:2000-03-13
## Median :2004-02-18 Median :2006-04-05 Median :2004-12-29
## Mean :2004-01-24 Mean :2007-05-24 Mean :2005-08-30
## 3rd Qu.:2009-04-14 3rd Qu.:2009-11-16 3rd Qu.:2009-11-04
## Max. :2014-06-30 Max. :2014-06-30 Max. :9999-07-21
## NA's :52662 NA's :57339 NA's :74915
## ReceivedDate IsDenied ClaimantAge_at_DOI Gender
## Min. :1947-02-24 0 :127805 Min. : 0.00 Length:186456
## 1st Qu.:1999-02-09 1 : 5989 1st Qu.:33.00 Class :character
## Median :2004-02-13 NA's: 52662 Median :42.00 Mode :character
## Mean :2004-07-20 Mean :42.05
## 3rd Qu.:2009-03-03 3rd Qu.:51.00
## Max. :9999-07-21 Max. :94.00
## NA's :52662 NA's :97688
## ClaimantType InjuryNature BodyPartRegion BodyPart
## Length:186456 Length:186456 Length:186456 Length:186456
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BillReviewALE Hospital PhysicianOutpatient Rx
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 16.0 1st Qu.: 194.0 1st Qu.: 107.6 1st Qu.: 23.3
## Median : 32.0 Median : 559.1 Median : 221.6 Median : 58.3
## Mean : 191.1 Mean : 4392.8 Mean : 1751.1 Mean : 1138.3
## 3rd Qu.: 80.0 3rd Qu.: 2252.6 3rd Qu.: 710.3 3rd Qu.: 174.4
## Max. :56475.3 Max. :2759604.0 Max. :1481468.5 Max. :631635.5
## NA's :139699 NA's :145110 NA's :84883 NA's :145590
## Total_Incurred_Cost_Claim Total_Paid_calc tiempo_de_procesamiento_days
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 59 1st Qu.: 60 1st Qu.: 161
## Median : 235 Median : 235 Median : 991
## Mean : 8919 Mean : 6750 Mean : 1317
## 3rd Qu.: 968 3rd Qu.: 937 3rd Qu.: 2142
## Max. :5054823 Max. :4527291 Max. :19177
## NA's :52662 NA's :52662 NA's :57339
# Guarda los objetos en un archivo RData
save(completa, df_clusters, file = "2bases_4.6.RData")
# Carga los objetos desde el archivo RData
load("2bases_4.6.RData")
# Exporta el dataframe 'completa' a un archivo CSV
write.csv(completa, file = "completa.csv", row.names = FALSE)
# Exporta el dataframe 'otro_dataframe' a un archivo CSV
write.csv(df_clusters, file = "df_clusters", row.names = FALSE)