Analisis Exploratorio y Descriptivo de Datos

Depuracion de Base de Datos

Cargar Base de Datos

Limpiar Memoria

rm(list=ls())

Directorio de Trabajo

getwd()
## [1] "D:/Usuarios/OSCAGAAl/Documents/Asesorias/Clases Diego Tarquino/Analisis Descriptivo o Exploratoria"

Librerias

source(list.files(pattern = "LIBRERIAS.R"))
## Warning: replacing previous import 'vctrs::data_frame' by 'tibble::data_frame'
## when loading 'dplyr'
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Cargar Datos

df= read_csv("Insumos/UCI_Credit_Card.csv") #  <- es lo mismo que igual
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.

Extructura

class(df)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"
df= as.data.frame(df)

Visualizacion de los datos fuente : https://rdrr.io/cran/creditmodel/man/UCICreditCard.html

head(df)

Revision de Tipo de variables

#str(df)
glimpse(df)
## Rows: 30,000
## Columns: 25
## $ ID                         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1...
## $ LIMIT_BAL                  <dbl> 20000, 120000, 90000, 50000, 50000, 5000...
## $ SEX                        <dbl> 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1...
## $ EDUCATION                  <dbl> 2, 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1, 2, 2...
## $ MARRIAGE                   <dbl> 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2...
## $ AGE                        <dbl> 24, 26, 34, 37, 57, 37, 29, 23, 28, 35, ...
## $ PAY_0                      <dbl> 2, -1, 0, 0, -1, 0, 0, 0, 0, -2, 0, -1, ...
## $ PAY_2                      <dbl> 2, 2, 0, 0, 0, 0, 0, -1, 0, -2, 0, -1, 0...
## $ PAY_3                      <dbl> -1, 0, 0, 0, -1, 0, 0, -1, 2, -2, 2, -1,...
## $ PAY_4                      <dbl> -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, -...
## $ PAY_5                      <dbl> -2, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, -...
## $ PAY_6                      <dbl> -2, 2, 0, 0, 0, 0, 0, -1, 0, -1, -1, 2, ...
## $ BILL_AMT1                  <dbl> 3913, 2682, 29239, 46990, 8617, 64400, 3...
## $ BILL_AMT2                  <dbl> 3102, 1725, 14027, 48233, 5670, 57069, 4...
## $ BILL_AMT3                  <dbl> 689, 2682, 13559, 49291, 35835, 57608, 4...
## $ BILL_AMT4                  <dbl> 0, 3272, 14331, 28314, 20940, 19394, 542...
## $ BILL_AMT5                  <dbl> 0, 3455, 14948, 28959, 19146, 19619, 483...
## $ BILL_AMT6                  <dbl> 0, 3261, 15549, 29547, 19131, 20024, 473...
## $ PAY_AMT1                   <dbl> 0, 0, 1518, 2000, 2000, 2500, 55000, 380...
## $ PAY_AMT2                   <dbl> 689, 1000, 1500, 2019, 36681, 1815, 4000...
## $ PAY_AMT3                   <dbl> 0, 1000, 1000, 1200, 10000, 657, 38000, ...
## $ PAY_AMT4                   <dbl> 0, 1000, 1000, 1100, 9000, 1000, 20239, ...
## $ PAY_AMT5                   <dbl> 0, 0, 1000, 1069, 689, 1000, 13750, 1687...
## $ PAY_AMT6                   <dbl> 0, 2000, 5000, 1000, 679, 800, 13770, 15...
## $ default.payment.next.month <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1...

Recodificicar las Variables se recodifica las sigueintes varaibles: * Sex (1=M,2=F) * Eduaction * MARRIAGE

Sexo

df$SEX=as.factor(df$SEX) # convierte en categoria
levels(df$SEX) # 
## [1] "1" "2"
levels(df$SEX)=c("M","F")
head(df,3)

Eduaction Education (1 = graduate school; 2 = university; 3 = high school; 4 = others)

df$EDUCATION=as.factor(df$EDUCATION) # convierte en categoria
levels(df$EDUCATION)
## [1] "0" "1" "2" "3" "4" "5" "6"
levels(df$EDUCATION)=c("Desconocido","Posgrado","Pregrado","Bachiller","Otros","Desconocido","Desconocido")
head(df,3)

MARRIAGE Marital status (1 = married; 2 = single; 3 = others)

df$MARRIAGE=as.factor(df$MARRIAGE)
levels(df$MARRIAGE)
## [1] "0" "1" "2" "3"
levels(df$MARRIAGE)=c("Desconocido","Casado","Soltero","Otros")
head(df,3)

Reviamos Datos Nulos o Perdidos

A nivel general de BD

sum(is.na(df))
## [1] 0

A nivel individual

apply(is.na(df),2, sum)
##                         ID                  LIMIT_BAL 
##                          0                          0 
##                        SEX                  EDUCATION 
##                          0                          0 
##                   MARRIAGE                        AGE 
##                          0                          0 
##                      PAY_0                      PAY_2 
##                          0                          0 
##                      PAY_3                      PAY_4 
##                          0                          0 
##                      PAY_5                      PAY_6 
##                          0                          0 
##                  BILL_AMT1                  BILL_AMT2 
##                          0                          0 
##                  BILL_AMT3                  BILL_AMT4 
##                          0                          0 
##                  BILL_AMT5                  BILL_AMT6 
##                          0                          0 
##                   PAY_AMT1                   PAY_AMT2 
##                          0                          0 
##                   PAY_AMT3                   PAY_AMT4 
##                          0                          0 
##                   PAY_AMT5                   PAY_AMT6 
##                          0                          0 
## default.payment.next.month 
##                          0

Revisar variables Categoricas table de frecuncia de Educacion

table(df$EDUCATION)
## 
## Desconocido    Posgrado    Pregrado   Bachiller       Otros 
##         345       10585       14030        4917         123

Sustutuir Desconocido por NA

df$EDUCATION[df$EDUCATION=="Desconocido"]=NA
sum(is.na(df))
## [1] 345

table de frecuncia de Estado Civil

table(df$MARRIAGE)
## 
## Desconocido      Casado     Soltero       Otros 
##          54       13659       15964         323

Sutitucion de Desconocido por Na

df$MARRIAGE[df$MARRIAGE=="Desconocido"]=NA
apply(is.na(df),2, sum)
##                         ID                  LIMIT_BAL 
##                          0                          0 
##                        SEX                  EDUCATION 
##                          0                        345 
##                   MARRIAGE                        AGE 
##                         54                          0 
##                      PAY_0                      PAY_2 
##                          0                          0 
##                      PAY_3                      PAY_4 
##                          0                          0 
##                      PAY_5                      PAY_6 
##                          0                          0 
##                  BILL_AMT1                  BILL_AMT2 
##                          0                          0 
##                  BILL_AMT3                  BILL_AMT4 
##                          0                          0 
##                  BILL_AMT5                  BILL_AMT6 
##                          0                          0 
##                   PAY_AMT1                   PAY_AMT2 
##                          0                          0 
##                   PAY_AMT3                   PAY_AMT4 
##                          0                          0 
##                   PAY_AMT5                   PAY_AMT6 
##                          0                          0 
## default.payment.next.month 
##                          0

Propocion de Valores NA en Educacion

Na_educacion= df[is.na(df$EDUCATION),]
write_xlsx(Na_educacion,"Na_Educacion.xlsx")

Propocion de Valores NA en Estado civil

Na_Estado_C= df[is.na(df$MARRIAGE),]
write_xlsx(Na_Estado_C,"Na_Estado_C.xlsx")

Unirlas

NAs_Data_completa= rbind(Na_educacion,Na_Estado_C)
write_xlsx(NAs_Data_completa,"NAs_Data_completa.xlsx")

Eliminar Na de la Base de datos df

df=na.omit(df)
sum(is.na(df))
## [1] 0
(399/29601)*100
## [1] 1.347927

tenemos una Base depurada

names(df)
##  [1] "ID"                         "LIMIT_BAL"                 
##  [3] "SEX"                        "EDUCATION"                 
##  [5] "MARRIAGE"                   "AGE"                       
##  [7] "PAY_0"                      "PAY_2"                     
##  [9] "PAY_3"                      "PAY_4"                     
## [11] "PAY_5"                      "PAY_6"                     
## [13] "BILL_AMT1"                  "BILL_AMT2"                 
## [15] "BILL_AMT3"                  "BILL_AMT4"                 
## [17] "BILL_AMT5"                  "BILL_AMT6"                 
## [19] "PAY_AMT1"                   "PAY_AMT2"                  
## [21] "PAY_AMT3"                   "PAY_AMT4"                  
## [23] "PAY_AMT5"                   "PAY_AMT6"                  
## [25] "default.payment.next.month"

Renombrar Variables

names(df)[25]="default"

Tipo de variables

class(df$default)
## [1] "numeric"

Tabla de conteo

table(df$default)
## 
##     0     1 
## 22996  6605

Recodificar la variable default a Cetegorica

df$default=as.factor(df$default)

Exportar la Base Depurada

formato csv

write.csv(df,"D:/Usuarios/OSCAGAAl/Documents/Asesorias/Clases Diego Tarquino/Analisis Descriptivo o Exploratoria/Insumos/Dafault_Depurada.csv",row.names = FALSE)

Formato Excel

write_xlsx(df,"D:/Usuarios/OSCAGAAl/Documents/Asesorias/Clases Diego Tarquino/Analisis Descriptivo o Exploratoria/Insumos/Dafault_Depurada.xlsx")