library(dplyr)
library(plotly)
library(foreign)
library(nnet)
data <- read.csv('bank-full.csv', sep = ";")
str(data)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
## $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
head(data)
## age job marital education default balance housing loan contact
## 1 58 management married tertiary no 2143 yes no unknown
## 2 44 technician single secondary no 29 yes no unknown
## 3 33 entrepreneur married secondary no 2 yes yes unknown
## 4 47 blue-collar married unknown no 1506 yes no unknown
## 5 33 unknown single unknown no 1 no no unknown
## 6 35 management married tertiary no 231 yes no unknown
## day month duration campaign pdays previous poutcome y
## 1 5 may 261 1 -1 0 unknown no
## 2 5 may 151 1 -1 0 unknown no
## 3 5 may 76 1 -1 0 unknown no
## 4 5 may 92 1 -1 0 unknown no
## 5 5 may 198 1 -1 0 unknown no
## 6 5 may 139 1 -1 0 unknown no
Como podemos ver todas las variables estan correctamente asociadas a su tipo de dato.
summary(data)
## age job marital education
## Min. :18.00 blue-collar:9732 divorced: 5207 primary : 6851
## 1st Qu.:33.00 management :9458 married :27214 secondary:23202
## Median :39.00 technician :7597 single :12790 tertiary :13301
## Mean :40.94 admin. :5171 unknown : 1857
## 3rd Qu.:48.00 services :4154
## Max. :95.00 retired :2264
## (Other) :6835
## default balance housing loan contact
## no :44396 Min. : -8019 no :20081 no :37967 cellular :29285
## yes: 815 1st Qu.: 72 yes:25130 yes: 7244 telephone: 2906
## Median : 448 unknown :13020
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
##
## day month duration campaign
## Min. : 1.00 may :13766 Min. : 0.0 Min. : 1.000
## 1st Qu.: 8.00 jul : 6895 1st Qu.: 103.0 1st Qu.: 1.000
## Median :16.00 aug : 6247 Median : 180.0 Median : 2.000
## Mean :15.81 jun : 5341 Mean : 258.2 Mean : 2.764
## 3rd Qu.:21.00 nov : 3970 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :31.00 apr : 2932 Max. :4918.0 Max. :63.000
## (Other): 6060
## pdays previous poutcome y
## Min. : -1.0 Min. : 0.0000 failure: 4901 no :39922
## 1st Qu.: -1.0 1st Qu.: 0.0000 other : 1840 yes: 5289
## Median : -1.0 Median : 0.0000 success: 1511
## Mean : 40.2 Mean : 0.5803 unknown:36959
## 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :871.0 Max. :275.0000
##
plot_ly(x=~data$age, y=~data$balance, color=~data$loan) %>% layout(xaxis=list(title="Edad"), yaxis=list("Balance"))
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
Podemos ver que los que tienen un préstamo (azul) tienen en promedio un menor balance que las personas que no tienen préstamo.
ggplot(data, aes(x=loan, fill=marital)) +
geom_bar()
La mayor cantidad de préstamos lo tienen las personas casadas.
regresion <- multinom(loan ~ ., data=data)
## # weights: 44 (43 variable)
## initial value 31337.877180
## iter 10 value 20559.047130
## iter 20 value 19985.007495
## iter 30 value 18947.047349
## iter 40 value 18544.672182
## iter 50 value 18451.975223
## final value 18450.461505
## converged
summary(regresion)
## Call:
## multinom(formula = loan ~ ., data = data)
##
## Coefficients:
## Values Std. Err.
## (Intercept) -1.300077e+00 1.404012e-02
## age -4.186412e-03 1.147322e-03
## jobblue-collar -1.047213e-01 3.160997e-02
## jobentrepreneur 2.457885e-01 2.733977e-02
## jobhousemaid -4.906332e-01 6.914731e-03
## jobmanagement -2.226319e-01 3.616676e-02
## jobretired -7.559845e-02 1.507766e-02
## jobself-employed -2.565069e-01 1.311818e-02
## jobservices -1.432913e-02 4.197344e-02
## jobstudent -2.392930e+00 6.956150e-04
## jobtechnician -3.204032e-02 3.460938e-02
## jobunemployed -8.900265e-01 4.214820e-03
## jobunknown -2.313332e+00 1.635082e-04
## maritalmarried 2.630685e-02 2.953558e-02
## maritalsingle -2.638775e-01 2.997406e-02
## educationsecondary 2.180121e-01 2.814507e-02
## educationtertiary 3.067803e-02 2.735821e-02
## educationunknown -6.860307e-01 7.557660e-03
## defaultyes 8.424469e-01 2.824845e-03
## balance -1.330591e-04 8.628046e-06
## housingyes 6.008065e-02 3.047806e-02
## contacttelephone -2.193715e-01 4.146862e-02
## contactunknown 1.917125e-01 2.323777e-02
## day -6.350332e-03 1.670773e-03
## monthaug -2.205327e-01 3.896876e-02
## monthdec -5.383206e-01 4.857025e-04
## monthfeb 2.025265e-01 4.187877e-02
## monthjan 3.271401e-01 1.165751e-02
## monthjul 1.101415e+00 2.992647e-02
## monthjun -3.102753e-02 2.383361e-02
## monthmar -4.563420e-01 1.621575e-03
## monthmay -5.406414e-02 2.719540e-02
## monthnov 5.605452e-01 4.066030e-02
## monthoct -1.643868e-01 2.575114e-03
## monthsep -5.521483e-01 1.306685e-03
## duration 6.356353e-05 5.746313e-05
## campaign -2.366910e-04 4.301092e-03
## pdays -2.086950e-04 1.716618e-04
## previous 8.043740e-03 5.752166e-03
## poutcomeother -6.146518e-02 6.086343e-03
## poutcomesuccess -7.936038e-01 7.008221e-03
## poutcomeunknown -1.913042e-01 1.757785e-02
## yyes -4.138714e-01 5.368691e-02
##
## Residual Deviance: 36900.92
## AIC: 36986.92
Podemos ver que si quiero darle un préstamo a una persona, necesito que tenga las siguientes características: - Educación secundaria o terciaria - Trabajo en emprendimiento - Estado civil: Casado/a
Sobre estas personas entonces se puede trabajar en su balance y tipo de cliente.
cliente <- data %>% filter(education=="tertiary") %>% filter(job=="entrepreneur") %>% filter(marital=="married")
nrow(cliente) #Tenemos 453 nuevos clientes potenciales
## [1] 453
a <- cliente %>% group_by(age) %>% summarise(freq=n())
a #Vemos que son clientes de 23 a 71 años
## Source: local data frame [42 x 2]
##
## age freq
## (int) (int)
## 1 23 1
## 2 25 3
## 3 26 4
## 4 27 5
## 5 28 3
## 6 29 2
## 7 30 7
## 8 31 10
## 9 32 15
## 10 33 18
## .. ... ...
b <- cliente %>% group_by(age, balance) %>% summarise(freq=n())
b
## Source: local data frame [406 x 3]
## Groups: age [?]
##
## age balance freq
## (int) (int) (int)
## 1 23 489 1
## 2 25 30 1
## 3 25 37 1
## 4 25 38 1
## 5 26 37 1
## 6 26 38 1
## 7 26 79 1
## 8 26 144 1
## 9 27 -23 1
## 10 27 59 1
## .. ... ... ...
plot_ly(cliente, x=~age, y=~balance)
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
#Si hacemos un saldo final promedio entre todos los clientes nuevos encontramos que:
#saldo promedio de los nuevos clientes
sum(cliente$balance)/nrow(cliente)
## [1] 2046.152
Se puede ver que si es factible si tomamos el rubro del balance ya que nos dice que en promedio los nuevos clientes elegidos si podrán pagar un préstamo que se les facilite dentro del banco y ésto generará más ingresos para el mismo.