## Introducción

Librerías

En primer lugar validaremos si las librerías que usaremos están instaladas y activadas para ser utilizadas en esta sesión.

# Install (only when missing) and attach every package used in this session.
# requireNamespace() performs the availability check because, unlike
# require(), it does not attach the package and, with quietly = TRUE, does not
# warn when the package is absent. library() then stops with an error if the
# installation did not succeed, instead of letting the script continue.
# The packages are attached in the same order as before.
paquetes <- c("tidyverse", "miscset", "car", "Hmisc")

for (paquete in paquetes) {
  if (!requireNamespace(paquete, quietly = TRUE)) {
    install.packages(paquete)
  }
  library(paquete, character.only = TRUE)
}

Estructura de los datos

# Set the working directory so that the relative file names used below resolve.
# NOTE(review): this is an absolute, Windows-style path; setwd() will fail on
# any machine where this folder does not exist — confirm before sharing.
path_loc <- "C:\\TEMPORAL\\6. ESTADÍSTICA\\3. R Studio"
setwd(path_loc)

# First import attempt with read_csv(), which splits fields on commas.
# The echoed column specification reports a single chr column, i.e. the file
# was not split into its variables by this call.
df <- read_csv("bank-additional-full.csv")
## Rows: 41188 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): age;job;marital;education;default;housing;loan;contact;month;day_of...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Dimensions of the imported tibble, formatted as "[rows x columns]".
# A single column here is the symptom of the wrong delimiter.
dim_desc(df)
## [1] "[41,188 x 1]"
# Column names of the imported tibble: the whole header row ended up as one
# name, with the real variable names separated by ";".
names(df)
## [1] "age;job;marital;education;default;housing;loan;contact;month;day_of_week;duration;campaign;pdays;previous;poutcome;emp.var.rate;cons.price.idx;cons.conf.idx;euribor3m;nr.employed;y"
# Inspect the first four raw lines of the file (header + three records) to
# identify the actual field separator.
readLines("bank-additional-full.csv", n = 4)
## [1] "\"age\";\"job\";\"marital\";\"education\";\"default\";\"housing\";\"loan\";\"contact\";\"month\";\"day_of_week\";\"duration\";\"campaign\";\"pdays\";\"previous\";\"poutcome\";\"emp.var.rate\";\"cons.price.idx\";\"cons.conf.idx\";\"euribor3m\";\"nr.employed\";\"y\""
## [2] "56;\"housemaid\";\"married\";\"basic.4y\";\"no\";\"no\";\"no\";\"telephone\";\"may\";\"mon\";261;1;999;0;\"nonexistent\";1.1;93.994;-36.4;4.857;5191;\"no\""                                                                                                             
## [3] "57;\"services\";\"married\";\"high.school\";\"unknown\";\"no\";\"no\";\"telephone\";\"may\";\"mon\";149;1;999;0;\"nonexistent\";1.1;93.994;-36.4;4.857;5191;\"no\""                                                                                                      
## [4] "37;\"services\";\"married\";\"high.school\";\"no\";\"yes\";\"no\";\"telephone\";\"may\";\"mon\";226;1;999;0;\"nonexistent\";1.1;93.994;-36.4;4.857;5191;\"no\""
# Print the tibble: every record is packed into a single character column.
df
## # A tibble: 41,188 × 1
##    age;job;marital;education;default;housing;loan;contact;month;day_of_week;du…¹
##    <chr>                                                                        
##  1 "56;\"housemaid\";\"married\";\"basic.4y\";\"no\";\"no\";\"no\";\"telephone\…
##  2 "57;\"services\";\"married\";\"high.school\";\"unknown\";\"no\";\"no\";\"tel…
##  3 "37;\"services\";\"married\";\"high.school\";\"no\";\"yes\";\"no\";\"telepho…
##  4 "40;\"admin.\";\"married\";\"basic.6y\";\"no\";\"no\";\"no\";\"telephone\";\…
##  5 "56;\"services\";\"married\";\"high.school\";\"no\";\"no\";\"yes\";\"telepho…
##  6 "45;\"services\";\"married\";\"basic.9y\";\"unknown\";\"no\";\"no\";\"teleph…
##  7 "59;\"admin.\";\"married\";\"professional.course\";\"no\";\"no\";\"no\";\"te…
##  8 "41;\"blue-collar\";\"married\";\"unknown\";\"unknown\";\"no\";\"no\";\"tele…
##  9 "24;\"technician\";\"single\";\"professional.course\";\"no\";\"yes\";\"no\";…
## 10 "25;\"services\";\"single\";\"high.school\";\"no\";\"yes\";\"no\";\"telephon…
## # … with 41,178 more rows, and abbreviated variable name
## #   ¹​`age;job;marital;education;default;housing;loan;contact;month;day_of_week;duration;campaign;pdays;previous;poutcome;emp.var.rate;cons.price.idx;cons.conf.idx;euribor3m;nr.employed;y`

### Estructura de datos

Por la estructura de los datos, podremos usar la función read.csv() con el argumento sep = ";" para importarlos como un data frame.

# Re-import the file as a base data frame, this time declaring ";" as the
# field separator so that each variable gets its own column.
banco <- read.csv("bank-additional-full.csv", sep = ";")

# Preview the first six rows.
head(banco, n = 6L)
##   age       job marital   education default housing loan   contact month
## 1  56 housemaid married    basic.4y      no      no   no telephone   may
## 2  57  services married high.school unknown      no   no telephone   may
## 3  37  services married high.school      no     yes   no telephone   may
## 4  40    admin. married    basic.6y      no      no   no telephone   may
## 5  56  services married high.school      no      no  yes telephone   may
## 6  45  services married    basic.9y unknown      no   no telephone   may
##   day_of_week duration campaign pdays previous    poutcome emp.var.rate
## 1         mon      261        1   999        0 nonexistent          1.1
## 2         mon      149        1   999        0 nonexistent          1.1
## 3         mon      226        1   999        0 nonexistent          1.1
## 4         mon      151        1   999        0 nonexistent          1.1
## 5         mon      307        1   999        0 nonexistent          1.1
## 6         mon      198        1   999        0 nonexistent          1.1
##   cons.price.idx cons.conf.idx euribor3m nr.employed  y
## 1         93.994         -36.4     4.857        5191 no
## 2         93.994         -36.4     4.857        5191 no
## 3         93.994         -36.4     4.857        5191 no
## 4         93.994         -36.4     4.857        5191 no
## 5         93.994         -36.4     4.857        5191 no
## 6         93.994         -36.4     4.857        5191 no

Verificamos las dimensiones de los datos

# Dimensions of the data frame: number of rows, then number of columns.
dim(banco)
## [1] 41188    21

Verificamos los tipos de datos que tienen las columnas de los datos.

# Data type (class) of every column, returned as a named list.

lapply(banco, class)
## $age
## [1] "integer"
## 
## $job
## [1] "character"
## 
## $marital
## [1] "character"
## 
## $education
## [1] "character"
## 
## $default
## [1] "character"
## 
## $housing
## [1] "character"
## 
## $loan
## [1] "character"
## 
## $contact
## [1] "character"
## 
## $month
## [1] "character"
## 
## $day_of_week
## [1] "character"
## 
## $duration
## [1] "integer"
## 
## $campaign
## [1] "integer"
## 
## $pdays
## [1] "integer"
## 
## $previous
## [1] "integer"
## 
## $poutcome
## [1] "character"
## 
## $emp.var.rate
## [1] "numeric"
## 
## $cons.price.idx
## [1] "numeric"
## 
## $cons.conf.idx
## [1] "numeric"
## 
## $euribor3m
## [1] "numeric"
## 
## $nr.employed
## [1] "numeric"
## 
## $y
## [1] "character"

Cambiamos los tipos de datos de las columnas para poder graficarlos sin inconvenientes y verificamos

# Convert every character column to a factor so that the plotting functions
# used later treat those variables as categorical.
# across(where(...)) replaces the superseded scoped verb mutate_if().
banco <- banco %>% mutate(across(where(is.character), as.factor))

# Compact column-by-column overview to verify the new types.
glimpse(banco)
## Rows: 41,188
## Columns: 21
## $ age            <int> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job            <fct> housemaid, services, services, admin., services, servic…
## $ marital        <fct> married, married, married, married, married, married, m…
## $ education      <fct> basic.4y, high.school, high.school, basic.6y, high.scho…
## $ default        <fct> no, unknown, no, no, no, unknown, no, unknown, no, no, …
## $ housing        <fct> no, no, yes, no, no, no, no, no, yes, yes, no, yes, no,…
## $ loan           <fct> no, no, no, no, yes, no, no, no, no, no, no, no, yes, n…
## $ contact        <fct> telephone, telephone, telephone, telephone, telephone, …
## $ month          <fct> may, may, may, may, may, may, may, may, may, may, may, …
## $ day_of_week    <fct> mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, …
## $ duration       <int> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
## $ campaign       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays          <int> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome       <fct> nonexistent, nonexistent, nonexistent, nonexistent, non…
## $ emp.var.rate   <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
## $ cons.price.idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
## $ cons.conf.idx  <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
## $ euribor3m      <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
## $ nr.employed    <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y              <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
# Re-check the dimensions after the type conversion.
dim(banco)
## [1] 41188    21
# Confirm that the former character columns are now factors.
lapply(banco, class)
## $age
## [1] "integer"
## 
## $job
## [1] "factor"
## 
## $marital
## [1] "factor"
## 
## $education
## [1] "factor"
## 
## $default
## [1] "factor"
## 
## $housing
## [1] "factor"
## 
## $loan
## [1] "factor"
## 
## $contact
## [1] "factor"
## 
## $month
## [1] "factor"
## 
## $day_of_week
## [1] "factor"
## 
## $duration
## [1] "integer"
## 
## $campaign
## [1] "integer"
## 
## $pdays
## [1] "integer"
## 
## $previous
## [1] "integer"
## 
## $poutcome
## [1] "factor"
## 
## $emp.var.rate
## [1] "numeric"
## 
## $cons.price.idx
## [1] "numeric"
## 
## $cons.conf.idx
## [1] "numeric"
## 
## $euribor3m
## [1] "numeric"
## 
## $nr.employed
## [1] "numeric"
## 
## $y
## [1] "factor"

Vemos un resumen de los datos

# Per-column summary: quartiles and mean for numeric columns, level counts
# for factor columns.
summary(banco)
##       age                 job            marital     
##  Min.   :17.00   admin.     :10422   divorced: 4612  
##  1st Qu.:32.00   blue-collar: 9254   married :24928  
##  Median :38.00   technician : 6743   single  :11568  
##  Mean   :40.02   services   : 3969   unknown :   80  
##  3rd Qu.:47.00   management : 2924                   
##  Max.   :98.00   retired    : 1720                   
##                  (Other)    : 6156                   
##                education        default         housing           loan      
##  university.degree  :12168   no     :32588   no     :18622   no     :33950  
##  high.school        : 9515   unknown: 8597   unknown:  990   unknown:  990  
##  basic.9y           : 6045   yes    :    3   yes    :21576   yes    : 6248  
##  professional.course: 5243                                                  
##  basic.4y           : 4176                                                  
##  basic.6y           : 2292                                                  
##  (Other)            : 1749                                                  
##       contact          month       day_of_week    duration     
##  cellular :26144   may    :13769   fri:7827    Min.   :   0.0  
##  telephone:15044   jul    : 7174   mon:8514    1st Qu.: 102.0  
##                    aug    : 6178   thu:8623    Median : 180.0  
##                    jun    : 5318   tue:8090    Mean   : 258.3  
##                    nov    : 4101   wed:8134    3rd Qu.: 319.0  
##                    apr    : 2632               Max.   :4918.0  
##                    (Other): 2016                               
##     campaign          pdays          previous            poutcome    
##  Min.   : 1.000   Min.   :  0.0   Min.   :0.000   failure    : 4252  
##  1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.000   nonexistent:35563  
##  Median : 2.000   Median :999.0   Median :0.000   success    : 1373  
##  Mean   : 2.568   Mean   :962.5   Mean   :0.173                      
##  3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.000                      
##  Max.   :56.000   Max.   :999.0   Max.   :7.000                      
##                                                                      
##   emp.var.rate      cons.price.idx  cons.conf.idx     euribor3m    
##  Min.   :-3.40000   Min.   :92.20   Min.   :-50.8   Min.   :0.634  
##  1st Qu.:-1.80000   1st Qu.:93.08   1st Qu.:-42.7   1st Qu.:1.344  
##  Median : 1.10000   Median :93.75   Median :-41.8   Median :4.857  
##  Mean   : 0.08189   Mean   :93.58   Mean   :-40.5   Mean   :3.621  
##  3rd Qu.: 1.40000   3rd Qu.:93.99   3rd Qu.:-36.4   3rd Qu.:4.961  
##  Max.   : 1.40000   Max.   :94.77   Max.   :-26.9   Max.   :5.045  
##                                                                    
##   nr.employed     y        
##  Min.   :4964   no :36548  
##  1st Qu.:5099   yes: 4640  
##  Median :5191              
##  Mean   :5167              
##  3rd Qu.:5228              
##  Max.   :5228              
## 

Gráfico 1_ Histograma de edad

# Histogram of client age, using the default binning.
hist(
  banco[["age"]],
  col = "red",
  main = "Histograma de Edad",
  xlab = "Edad",
  ylab = "Frecuencia"
)

Gráfico 2_ Histograma de duración

# Histogram of the `duration` variable, using the default binning.
hist(
  banco[["duration"]],
  col = "pink",
  main = "Histograma de Duración",
  xlab = "Duración",
  ylab = "Frecuencia"
)

Gráfico 3_ Histograma de campaña

# Histogram of the `campaign` variable, using the default binning.
hist(
  banco[["campaign"]],
  col = "blue",
  main = "Histograma de Campaña",
  xlab = "Campaña",
  ylab = "Frecuencia"
)

Creamos una nueva tabla llamada tab_banco para los posteriores gráficos.

# Number of clients in each education level.
table(banco$education)
## 
##            basic.4y            basic.6y            basic.9y         high.school 
##                4176                2292                6045                9515 
##          illiterate professional.course   university.degree             unknown 
##                  18                5243               12168                1731
# Two-way contingency table: loan status in rows, education level in columns.
tab_banco <- table(banco[["loan"]], banco[["education"]])

# Show the resulting counts.
print(tab_banco)
##          
##           basic.4y basic.6y basic.9y high.school illiterate professional.course
##   no          3435     1902     5011        7852         15                4325
##   unknown      117       60      151         220          0                 125
##   yes          624      330      883        1443          3                 793
##          
##           university.degree unknown
##   no                   9961    1449
##   unknown               277      40
##   yes                  1930     242

Gráfico 4_ Frecuencia por nivel de educación

# Bar chart with the number of clients per education level (plot() on a
# factor draws one bar per level). One fill colour is supplied for each of the
# eight levels, in level order. The title spelling was corrected
# ("Educación").
plot(x = banco$education, main = "Gráfica de Educación",
     xlab = "Nivel educativo", ylab = "Frecuencia",
     col = c("royalblue", "pink", "purple", "darkturquoise", "peru", "red", "darkgreen","darkkhaki"))

Vemos cuántos valores únicos hay en este vector.

# Distinct values of `loan`; because it is a factor, its levels are printed too.
unique(banco$loan)
## [1] no      yes     unknown
## Levels: no unknown yes

A continuación, calculamos las proporciones de cada estado de préstamo (loan) dentro de cada nivel educativo.

# Column-wise proportions (margin = 2): within each education level, the
# shares of the loan categories add up to 1.
ptab_banco <- prop.table(tab_banco, margin = 2)

Gráfico 5_ Proporciones de préstamo por nivel educativo

# Stacked bar chart of loan status within each education level.
# barplot() assigns the colours to the stacked segments in the row order of
# the table (no, unknown, yes). The legend labels are therefore taken from the
# same row names and paired with the same colour vector, so every label sits
# next to the colour actually used for it (previously "Yes" and "unknown" were
# swapped in the legend).
colores_barras <- c("mediumturquoise", "slategrey", "mediumvioletred")
barplot(ptab_banco, main = "Préstamos por nivel educativo",
        xlab = "Nivel educativo", ylab = "Proporción",
        col = colores_barras)
legend(x = "bottomright", legend = rownames(ptab_banco), fill = colores_barras,
       title = "Loan")

Veamos la relación entre las variables age y balance de banco

Gráfico 6_Relación de edad y balance con loan como variable de color

# Scatter plot with points coloured by loan status.
# Each point's colour is looked up by level name in a named vector, and the
# legend is filled from that same vector, so labels and colours always agree.
# Previously the points were coloured by factor code (levels ordered
# no / unknown / yes, drawn with the default palette) while the legend listed
# No / Yes / unknown with unrelated colours.
# NOTE(review): `balance` is not among the 21 columns of `banco`, so
# `banco$balance` evaluates to NULL and plot() falls back to drawing age
# against the observation index. Confirm which variable was intended for the
# y axis.
colores_puntos <- c(no = "Black", yes = "Red", unknown = "blue")
plot(x = banco$age, y = banco$balance,
     col = unname(colores_puntos[as.character(banco$loan)]))
legend(x = "topleft", legend = c("No", "Yes", "unknown"),
       fill = unname(colores_puntos), title = "Loan")

Gráfico 7_Diagrama de cajas de edad vs nivel educativo

# Age distribution per education level: plot() with a factor on x and a
# numeric vector on y draws one box per level. One fill colour per level.
plot(
  banco[["education"]],
  banco[["age"]],
  col = c("royalblue", "pink", "purple", "darkturquoise", "peru", "red", "darkgreen","darkkhaki"),
  main = "Edad por nivel educativo",
  xlab = "Nivel educativo",
  ylab = "Edad"
)

Gráfico 8_Gráfico de mosaico de estado matrimonial vs nivel educativo

# Education level broken down by marital status: plot() with two factors
# draws a mosaic-style (spine) chart. One fill colour per education level.
plot(
  banco[["marital"]],
  banco[["education"]],
  col = c("darkgreen", "darkkhaki", "darkmagenta", "darkred", "darksalmon", "darkslateblue", "darkslategrey","darkturquoise"),
  main = "Nivel educativo por estado matrimonial ",
  xlab = "Estado matrimonial",
  ylab = "Nivel educativo"
)