Limpiar Memoria
rm(list=ls())
Directorio de Trabajo
getwd()
## [1] "D:/Usuarios/OSCAGAAl/Documents/Asesorias/Clases Diego Tarquino/Analisis Descriptivo o Exploratoria"
Librerias
source(list.files(pattern = "LIBRERIAS.R"))
## Warning: replacing previous import 'vctrs::data_frame' by 'tibble::data_frame'
## when loading 'dplyr'
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
Cargar Datos
df= read_csv("Insumos/UCI_Credit_Card.csv") # <- es lo mismo que igual
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
Extructura
class(df)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
df= as.data.frame(df)
Visualizacion de los datos fuente : https://rdrr.io/cran/creditmodel/man/UCICreditCard.html
head(df)
Revision de Tipo de variables
#str(df)
glimpse(df)
## Rows: 30,000
## Columns: 25
## $ ID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1...
## $ LIMIT_BAL <dbl> 20000, 120000, 90000, 50000, 50000, 5000...
## $ SEX <dbl> 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1...
## $ EDUCATION <dbl> 2, 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1, 2, 2...
## $ MARRIAGE <dbl> 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2...
## $ AGE <dbl> 24, 26, 34, 37, 57, 37, 29, 23, 28, 35, ...
## $ PAY_0 <dbl> 2, -1, 0, 0, -1, 0, 0, 0, 0, -2, 0, -1, ...
## $ PAY_2 <dbl> 2, 2, 0, 0, 0, 0, 0, -1, 0, -2, 0, -1, 0...
## $ PAY_3 <dbl> -1, 0, 0, 0, -1, 0, 0, -1, 2, -2, 2, -1,...
## $ PAY_4 <dbl> -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -1, -...
## $ PAY_5 <dbl> -2, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, -...
## $ PAY_6 <dbl> -2, 2, 0, 0, 0, 0, 0, -1, 0, -1, -1, 2, ...
## $ BILL_AMT1 <dbl> 3913, 2682, 29239, 46990, 8617, 64400, 3...
## $ BILL_AMT2 <dbl> 3102, 1725, 14027, 48233, 5670, 57069, 4...
## $ BILL_AMT3 <dbl> 689, 2682, 13559, 49291, 35835, 57608, 4...
## $ BILL_AMT4 <dbl> 0, 3272, 14331, 28314, 20940, 19394, 542...
## $ BILL_AMT5 <dbl> 0, 3455, 14948, 28959, 19146, 19619, 483...
## $ BILL_AMT6 <dbl> 0, 3261, 15549, 29547, 19131, 20024, 473...
## $ PAY_AMT1 <dbl> 0, 0, 1518, 2000, 2000, 2500, 55000, 380...
## $ PAY_AMT2 <dbl> 689, 1000, 1500, 2019, 36681, 1815, 4000...
## $ PAY_AMT3 <dbl> 0, 1000, 1000, 1200, 10000, 657, 38000, ...
## $ PAY_AMT4 <dbl> 0, 1000, 1000, 1100, 9000, 1000, 20239, ...
## $ PAY_AMT5 <dbl> 0, 0, 1000, 1069, 689, 1000, 13750, 1687...
## $ PAY_AMT6 <dbl> 0, 2000, 5000, 1000, 679, 800, 13770, 15...
## $ default.payment.next.month <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1...
Recodificicar las Variables se recodifica las sigueintes varaibles: * Sex (1=M,2=F) * Eduaction * MARRIAGE
Sexo
df$SEX=as.factor(df$SEX) # convierte en categoria
levels(df$SEX) #
## [1] "1" "2"
levels(df$SEX)=c("M","F")
head(df,3)
Eduaction Education (1 = graduate school; 2 = university; 3 = high school; 4 = others)
df$EDUCATION=as.factor(df$EDUCATION) # convierte en categoria
levels(df$EDUCATION)
## [1] "0" "1" "2" "3" "4" "5" "6"
levels(df$EDUCATION)=c("Desconocido","Posgrado","Pregrado","Bachiller","Otros","Desconocido","Desconocido")
head(df,3)
MARRIAGE Marital status (1 = married; 2 = single; 3 = others)
df$MARRIAGE=as.factor(df$MARRIAGE)
levels(df$MARRIAGE)
## [1] "0" "1" "2" "3"
levels(df$MARRIAGE)=c("Desconocido","Casado","Soltero","Otros")
head(df,3)
Reviamos Datos Nulos o Perdidos
A nivel general de BD
sum(is.na(df))
## [1] 0
A nivel individual
apply(is.na(df),2, sum)
## ID LIMIT_BAL
## 0 0
## SEX EDUCATION
## 0 0
## MARRIAGE AGE
## 0 0
## PAY_0 PAY_2
## 0 0
## PAY_3 PAY_4
## 0 0
## PAY_5 PAY_6
## 0 0
## BILL_AMT1 BILL_AMT2
## 0 0
## BILL_AMT3 BILL_AMT4
## 0 0
## BILL_AMT5 BILL_AMT6
## 0 0
## PAY_AMT1 PAY_AMT2
## 0 0
## PAY_AMT3 PAY_AMT4
## 0 0
## PAY_AMT5 PAY_AMT6
## 0 0
## default.payment.next.month
## 0
Revisar variables Categoricas table de frecuncia de Educacion
table(df$EDUCATION)
##
## Desconocido Posgrado Pregrado Bachiller Otros
## 345 10585 14030 4917 123
Sustutuir Desconocido por NA
df$EDUCATION[df$EDUCATION=="Desconocido"]=NA
sum(is.na(df))
## [1] 345
table de frecuncia de Estado Civil
table(df$MARRIAGE)
##
## Desconocido Casado Soltero Otros
## 54 13659 15964 323
Sutitucion de Desconocido por Na
df$MARRIAGE[df$MARRIAGE=="Desconocido"]=NA
apply(is.na(df),2, sum)
## ID LIMIT_BAL
## 0 0
## SEX EDUCATION
## 0 345
## MARRIAGE AGE
## 54 0
## PAY_0 PAY_2
## 0 0
## PAY_3 PAY_4
## 0 0
## PAY_5 PAY_6
## 0 0
## BILL_AMT1 BILL_AMT2
## 0 0
## BILL_AMT3 BILL_AMT4
## 0 0
## BILL_AMT5 BILL_AMT6
## 0 0
## PAY_AMT1 PAY_AMT2
## 0 0
## PAY_AMT3 PAY_AMT4
## 0 0
## PAY_AMT5 PAY_AMT6
## 0 0
## default.payment.next.month
## 0
Propocion de Valores NA en Educacion
Na_educacion= df[is.na(df$EDUCATION),]
write_xlsx(Na_educacion,"Na_Educacion.xlsx")
Propocion de Valores NA en Estado civil
Na_Estado_C= df[is.na(df$MARRIAGE),]
write_xlsx(Na_Estado_C,"Na_Estado_C.xlsx")
Unirlas
NAs_Data_completa= rbind(Na_educacion,Na_Estado_C)
write_xlsx(NAs_Data_completa,"NAs_Data_completa.xlsx")
Eliminar Na de la Base de datos df
df=na.omit(df)
sum(is.na(df))
## [1] 0
(399/29601)*100
## [1] 1.347927
names(df)
## [1] "ID" "LIMIT_BAL"
## [3] "SEX" "EDUCATION"
## [5] "MARRIAGE" "AGE"
## [7] "PAY_0" "PAY_2"
## [9] "PAY_3" "PAY_4"
## [11] "PAY_5" "PAY_6"
## [13] "BILL_AMT1" "BILL_AMT2"
## [15] "BILL_AMT3" "BILL_AMT4"
## [17] "BILL_AMT5" "BILL_AMT6"
## [19] "PAY_AMT1" "PAY_AMT2"
## [21] "PAY_AMT3" "PAY_AMT4"
## [23] "PAY_AMT5" "PAY_AMT6"
## [25] "default.payment.next.month"
Renombrar Variables
names(df)[25]="default"
Tipo de variables
class(df$default)
## [1] "numeric"
Tabla de conteo
table(df$default)
##
## 0 1
## 22996 6605
Recodificar la variable default a Cetegorica
df$default=as.factor(df$default)
Exportar la Base Depurada
formato csv
write.csv(df,"D:/Usuarios/OSCAGAAl/Documents/Asesorias/Clases Diego Tarquino/Analisis Descriptivo o Exploratoria/Insumos/Dafault_Depurada.csv",row.names = FALSE)
Formato Excel
write_xlsx(df,"D:/Usuarios/OSCAGAAl/Documents/Asesorias/Clases Diego Tarquino/Analisis Descriptivo o Exploratoria/Insumos/Dafault_Depurada.xlsx")