##Introducción
En primer lugar validaremos si las librerías que usaremos están instaladas y activadas para ser utilizadas en esta sesión.
# Make sure every package used in this session is installed, then attach it.
# requireNamespace() only checks availability (it does not attach), so the
# explicit library() call below fails loudly if a package still cannot be
# loaded after the installation attempt.
required_packages <- c("tidyverse", "miscset", "car", "Hmisc")
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
# Setting the working directory
# NOTE(review): absolute, machine-specific Windows path; the script only runs
# as-is where this folder exists — consider a project-relative path.
path_loc <- "C:\\TEMPORAL\\6. ESTADÍSTICA\\3. R Studio"
setwd(path_loc)
# reading in the data
# read_csv() is called with its default settings here; the output below shows
# a single column being read, because the file's fields are separated by ";".
df <- read_csv("bank-additional-full.csv")
## Rows: 41188 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): age;job;marital;education;default;housing;loan;contact;month;day_of...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# dimensions of the data
dim_desc(df)
## [1] "[41,188 x 1]"
# names of the data
names(df)
## [1] "age;job;marital;education;default;housing;loan;contact;month;day_of_week;duration;campaign;pdays;previous;poutcome;emp.var.rate;cons.price.idx;cons.conf.idx;euribor3m;nr.employed;y"
readLines("bank-additional-full.csv", n = 4)
## [1] "\"age\";\"job\";\"marital\";\"education\";\"default\";\"housing\";\"loan\";\"contact\";\"month\";\"day_of_week\";\"duration\";\"campaign\";\"pdays\";\"previous\";\"poutcome\";\"emp.var.rate\";\"cons.price.idx\";\"cons.conf.idx\";\"euribor3m\";\"nr.employed\";\"y\""
## [2] "56;\"housemaid\";\"married\";\"basic.4y\";\"no\";\"no\";\"no\";\"telephone\";\"may\";\"mon\";261;1;999;0;\"nonexistent\";1.1;93.994;-36.4;4.857;5191;\"no\""
## [3] "57;\"services\";\"married\";\"high.school\";\"unknown\";\"no\";\"no\";\"telephone\";\"may\";\"mon\";149;1;999;0;\"nonexistent\";1.1;93.994;-36.4;4.857;5191;\"no\""
## [4] "37;\"services\";\"married\";\"high.school\";\"no\";\"yes\";\"no\";\"telephone\";\"may\";\"mon\";226;1;999;0;\"nonexistent\";1.1;93.994;-36.4;4.857;5191;\"no\""
df
## # A tibble: 41,188 × 1
## age;job;marital;education;default;housing;loan;contact;month;day_of_week;du…¹
## <chr>
## 1 "56;\"housemaid\";\"married\";\"basic.4y\";\"no\";\"no\";\"no\";\"telephone\…
## 2 "57;\"services\";\"married\";\"high.school\";\"unknown\";\"no\";\"no\";\"tel…
## 3 "37;\"services\";\"married\";\"high.school\";\"no\";\"yes\";\"no\";\"telepho…
## 4 "40;\"admin.\";\"married\";\"basic.6y\";\"no\";\"no\";\"no\";\"telephone\";\…
## 5 "56;\"services\";\"married\";\"high.school\";\"no\";\"no\";\"yes\";\"telepho…
## 6 "45;\"services\";\"married\";\"basic.9y\";\"unknown\";\"no\";\"no\";\"teleph…
## 7 "59;\"admin.\";\"married\";\"professional.course\";\"no\";\"no\";\"no\";\"te…
## 8 "41;\"blue-collar\";\"married\";\"unknown\";\"unknown\";\"no\";\"no\";\"tele…
## 9 "24;\"technician\";\"single\";\"professional.course\";\"no\";\"yes\";\"no\";…
## 10 "25;\"services\";\"single\";\"high.school\";\"no\";\"yes\";\"no\";\"telephon…
## # … with 41,178 more rows, and abbreviated variable name
## # ¹`age;job;marital;education;default;housing;loan;contact;month;day_of_week;duration;campaign;pdays;previous;poutcome;emp.var.rate;cons.price.idx;cons.conf.idx;euribor3m;nr.employed;y`
### Estructura de datos
Por la estructura de los datos, podremos usar la función read.csv() con el argumento sep = ";" para importarlos como un data frame.
# Re-import the file with read.csv(), declaring ";" as the field separator.
banco <- read.csv(file = "bank-additional-full.csv", sep = ";")
# First rows
head(banco)
## age job marital education default housing loan contact month
## 1 56 housemaid married basic.4y no no no telephone may
## 2 57 services married high.school unknown no no telephone may
## 3 37 services married high.school no yes no telephone may
## 4 40 admin. married basic.6y no no no telephone may
## 5 56 services married high.school no no yes telephone may
## 6 45 services married basic.9y unknown no no telephone may
## day_of_week duration campaign pdays previous poutcome emp.var.rate
## 1 mon 261 1 999 0 nonexistent 1.1
## 2 mon 149 1 999 0 nonexistent 1.1
## 3 mon 226 1 999 0 nonexistent 1.1
## 4 mon 151 1 999 0 nonexistent 1.1
## 5 mon 307 1 999 0 nonexistent 1.1
## 6 mon 198 1 999 0 nonexistent 1.1
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## 1 93.994 -36.4 4.857 5191 no
## 2 93.994 -36.4 4.857 5191 no
## 3 93.994 -36.4 4.857 5191 no
## 4 93.994 -36.4 4.857 5191 no
## 5 93.994 -36.4 4.857 5191 no
## 6 93.994 -36.4 4.857 5191 no
# Dimensions (rows, columns)
dim(banco)
## [1] 41188 21
# Class of each column
lapply(banco, class)
## $age
## [1] "integer"
##
## $job
## [1] "character"
##
## $marital
## [1] "character"
##
## $education
## [1] "character"
##
## $default
## [1] "character"
##
## $housing
## [1] "character"
##
## $loan
## [1] "character"
##
## $contact
## [1] "character"
##
## $month
## [1] "character"
##
## $day_of_week
## [1] "character"
##
## $duration
## [1] "integer"
##
## $campaign
## [1] "integer"
##
## $pdays
## [1] "integer"
##
## $previous
## [1] "integer"
##
## $poutcome
## [1] "character"
##
## $emp.var.rate
## [1] "numeric"
##
## $cons.price.idx
## [1] "numeric"
##
## $cons.conf.idx
## [1] "numeric"
##
## $euribor3m
## [1] "numeric"
##
## $nr.employed
## [1] "numeric"
##
## $y
## [1] "character"
# Convert every character column to a factor so that summaries and plots
# treat those columns as categorical variables. across() + where() replaces
# the superseded mutate_if().
banco <- banco %>%
  mutate(across(where(is.character), as.factor))
# Compact overview of the column types after the conversion.
glimpse(banco)
## Rows: 41,188
## Columns: 21
## $ age <int> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job <fct> housemaid, services, services, admin., services, servic…
## $ marital <fct> married, married, married, married, married, married, m…
## $ education <fct> basic.4y, high.school, high.school, basic.6y, high.scho…
## $ default <fct> no, unknown, no, no, no, unknown, no, unknown, no, no, …
## $ housing <fct> no, no, yes, no, no, no, no, no, yes, yes, no, yes, no,…
## $ loan <fct> no, no, no, no, yes, no, no, no, no, no, no, no, yes, n…
## $ contact <fct> telephone, telephone, telephone, telephone, telephone, …
## $ month <fct> may, may, may, may, may, may, may, may, may, may, may, …
## $ day_of_week <fct> mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, …
## $ duration <int> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
## $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays <int> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome <fct> nonexistent, nonexistent, nonexistent, nonexistent, non…
## $ emp.var.rate <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
## $ cons.price.idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
## $ cons.conf.idx <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
## $ euribor3m <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
## $ nr.employed <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
dim(banco)
## [1] 41188 21
lapply(banco, class)
## $age
## [1] "integer"
##
## $job
## [1] "factor"
##
## $marital
## [1] "factor"
##
## $education
## [1] "factor"
##
## $default
## [1] "factor"
##
## $housing
## [1] "factor"
##
## $loan
## [1] "factor"
##
## $contact
## [1] "factor"
##
## $month
## [1] "factor"
##
## $day_of_week
## [1] "factor"
##
## $duration
## [1] "integer"
##
## $campaign
## [1] "integer"
##
## $pdays
## [1] "integer"
##
## $previous
## [1] "integer"
##
## $poutcome
## [1] "factor"
##
## $emp.var.rate
## [1] "numeric"
##
## $cons.price.idx
## [1] "numeric"
##
## $cons.conf.idx
## [1] "numeric"
##
## $euribor3m
## [1] "numeric"
##
## $nr.employed
## [1] "numeric"
##
## $y
## [1] "factor"
summary(banco)
## age job marital
## Min. :17.00 admin. :10422 divorced: 4612
## 1st Qu.:32.00 blue-collar: 9254 married :24928
## Median :38.00 technician : 6743 single :11568
## Mean :40.02 services : 3969 unknown : 80
## 3rd Qu.:47.00 management : 2924
## Max. :98.00 retired : 1720
## (Other) : 6156
## education default housing loan
## university.degree :12168 no :32588 no :18622 no :33950
## high.school : 9515 unknown: 8597 unknown: 990 unknown: 990
## basic.9y : 6045 yes : 3 yes :21576 yes : 6248
## professional.course: 5243
## basic.4y : 4176
## basic.6y : 2292
## (Other) : 1749
## contact month day_of_week duration
## cellular :26144 may :13769 fri:7827 Min. : 0.0
## telephone:15044 jul : 7174 mon:8514 1st Qu.: 102.0
## aug : 6178 thu:8623 Median : 180.0
## jun : 5318 tue:8090 Mean : 258.3
## nov : 4101 wed:8134 3rd Qu.: 319.0
## apr : 2632 Max. :4918.0
## (Other): 2016
## campaign pdays previous poutcome
## Min. : 1.000 Min. : 0.0 Min. :0.000 failure : 4252
## 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.000 nonexistent:35563
## Median : 2.000 Median :999.0 Median :0.000 success : 1373
## Mean : 2.568 Mean :962.5 Mean :0.173
## 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.000
## Max. :56.000 Max. :999.0 Max. :7.000
##
## emp.var.rate cons.price.idx cons.conf.idx euribor3m
## Min. :-3.40000 Min. :92.20 Min. :-50.8 Min. :0.634
## 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.344
## Median : 1.10000 Median :93.75 Median :-41.8 Median :4.857
## Mean : 0.08189 Mean :93.58 Mean :-40.5 Mean :3.621
## 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961
## Max. : 1.40000 Max. :94.77 Max. :-26.9 Max. :5.045
##
## nr.employed y
## Min. :4964 no :36548
## 1st Qu.:5099 yes: 4640
## Median :5191
## Mean :5167
## 3rd Qu.:5228
## Max. :5228
##
# Frequency histograms for three numeric columns of `banco`. Each entry gives
# the column name, the plot title, the x-axis label and the fill colour.
hist_specs <- list(
  list(var = "age", title = "Histograma de Edad",
       label = "Edad", fill = "red"),
  list(var = "duration", title = "Histograma de Duración",
       label = "Duración", fill = "pink"),
  list(var = "campaign", title = "Histograma de Campaña",
       label = "Campaña", fill = "blue")
)
for (spec in hist_specs) {
  hist(
    x = banco[[spec$var]], main = spec$title,
    xlab = spec$label, ylab = "Frecuencia",
    col = spec$fill
  )
}
table(banco$education)
##
## basic.4y basic.6y basic.9y high.school
## 4176 2292 6045 9515
## illiterate professional.course university.degree unknown
## 18 5243 12168 1731
# Cross-tabulation of loan status (rows) by education level (columns).
tab_banco <- table(banco$loan, banco$education)
# Result
tab_banco
##
## basic.4y basic.6y basic.9y high.school illiterate professional.course
## no 3435 1902 5011 7852 15 4325
## unknown 117 60 151 220 0 125
## yes 624 330 883 1443 3 793
##
## university.degree unknown
## no 9961 1449
## unknown 277 40
## yes 1930 242
# Bar chart of the number of rows in each education level (plot() on a factor
# draws one bar per level), with one colour per level.
# The title previously contained a typo ("Educacíón").
plot(x = banco$education, main = "Gráfica de Educación",
     xlab = "Nivel educativo", ylab = "Frecuencia",
     col = c("royalblue", "pink", "purple", "darkturquoise", "peru", "red",
             "darkgreen", "darkkhaki"))
unique(banco$loan)
## [1] no yes unknown
## Levels: no unknown yes
# Column-wise proportions: within each education level, the share of each
# loan status (every column of the table sums to 1).
ptab_banco <- prop.table(tab_banco, margin = 2)
# One colour per row of the table, i.e. per loan status, in row order.
loan_colours <- c("mediumturquoise", "slategrey", "mediumvioletred")
barplot(ptab_banco, main = "Préstamos por nivel educativo",
        xlab = "Nivel educativo", ylab = "Proporción",
        col = loan_colours)
# Take the legend labels from the table rows so that they follow the same
# order as the bar segments ("no", "unknown", "yes"). A hand-written label
# vector in a different order pairs the wrong label with each colour.
legend(x = "bottomright", legend = rownames(ptab_banco), fill = loan_colours,
       title = "Loan")
# Scatter plot of age against duration, coloured by loan status.
# The data set has no `balance` column, so banco$balance is NULL and plot()
# silently fell back to plotting age against the row index.
# NOTE(review): `duration` is used as the numeric y variable here — confirm
# that this is the variable the analysis intends to show.
loan_levels <- levels(banco$loan)
plot(x = banco$age, y = banco$duration, col = as.integer(banco$loan))
# Points are coloured by the integer code of each factor level, so the legend
# uses the same codes together with the level names in level order.
legend(x = "topleft", legend = loan_levels, fill = seq_along(loan_levels),
       title = "Loan")
# Age by education level: x is a factor and y is numeric, so plot() draws one
# box plot per education level, each with its own colour.
plot(x = banco$education, y = banco$age, main = "Edad por nivel educativo",
xlab = "Nivel educativo", ylab = "Edad",
col = c("royalblue", "pink", "purple", "darkturquoise", "peru", "red", "darkgreen","darkkhaki"))
# Education level by marital status: both arguments are factors, so plot()
# draws a spine plot with one colour per education level.
plot(x = banco$marital, y = banco$education, main = "Nivel educativo por estado matrimonial ",
xlab = "Estado matrimonial", ylab = "Nivel educativo",
col = c("darkgreen", "darkkhaki", "darkmagenta", "darkred", "darksalmon", "darkslateblue", "darkslategrey","darkturquoise"))