datos <- read.csv("datos.csv")
datos <- datos[, c(1,2,3,4,5,6,7,9,10,11,8)]  # move Hijos (original column 8) to the last position
head(datos)
## X DIRECTORIO P6020 P6040 P5502 P6071 P5667 CANT_PERSONAS_HOGAR I_HOGAR
## 1 1 7120001 2 44 2 1 4 3800000.0
## 2 2 7120002 1 61 6 1 10 3656652.8
## 3 3 7120005 1 79 6 1 2 4200000.0
## 4 4 7120006 2 37 5 NA 3 1365000.0
## 5 5 7120007 1 72 5 NA 3 765833.3
## 6 6 7120008 1 73 5 NA 1 1785721.9
## P5010 Hijos
## 1 2 1
## 2 4 5
## 3 1 0
## 4 2 0
## 5 3 0
## 6 1 0
Renaming map: X1 = DIRECTORIO, X2 = P6020, X3 = P6040, X4 = P5502, X5 = P6071, X6 = P5667, X7 = CANT_PERSONAS_HOGAR, X8 = I_HOGAR, X9 = P5010, Y = Hijos (X keeps the row index).
colnames(datos) <- c("X", "X1", "X2","X3", "X4","X5", "X6","X7","X8", "X9", "Y")
head(datos)
## X X1 X2 X3 X4 X5 X6 X7 X8 X9 Y
## 1 1 7120001 2 44 2 1 4 3800000.0 2 1
## 2 2 7120002 1 61 6 1 10 3656652.8 4 5
## 3 3 7120005 1 79 6 1 2 4200000.0 1 0
## 4 4 7120006 2 37 5 NA 3 1365000.0 2 0
## 5 5 7120007 1 72 5 NA 3 765833.3 3 0
## 6 6 7120008 1 73 5 NA 1 1785721.9 1 0
# Keep the candidate predictors and the outcome (the row index X, DIRECTORIO, and P5010 are dropped)
datos_sel <- subset(datos, select = c('X2','X3','X4','X5','X6','X7','X8','Y'))
library(knitr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(keras)
library(lime)
##
## Attaching package: 'lime'
## The following object is masked from 'package:dplyr':
##
## explain
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## == Need to Learn tidyquant? ====================================================
## Business Science offers a 1-hour course - Learning Lab #9: Performance Analysis & Portfolio Optimization with tidyquant!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(rsample)
library(recipes)
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
library(yardstick)
## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.
##
## Attaching package: 'yardstick'
## The following object is masked from 'package:keras':
##
## get_weights
## The following object is masked from 'package:readr':
##
## spec
library(corrr)
glimpse(datos_sel)
## Rows: 93,160
## Columns: 8
## $ X2 <int> 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1~
## $ X3 <int> 44, 61, 79, 37, 72, 73, 74, 41, 26, 39, 24, 60, 65, 35, 35, 55, 64,~
## $ X4 <int> 2, 6, 6, 5, 5, 5, 3, 2, 4, 4, 2, 3, 6, 6, 2, 2, 2, 2, 5, 4, 2, 5, 6~
## $ X5 <int> 1, 1, 1, NA, NA, NA, NA, 1, NA, NA, 1, NA, 1, 1, 1, 1, 1, 1, NA, NA~
## $ X6 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",~
## $ X7 <int> 4, 10, 2, 3, 3, 1, 2, 5, 4, 3, 4, 6, 2, 4, 4, 3, 3, 2, 1, 2, 3, 3, ~
## $ X8 <dbl> 3800000.0, 3656652.8, 4200000.0, 1365000.0, 765833.3, 1785721.9, 18~
## $ Y <int> 1, 5, 0, 0, 0, 0, 1, 3, 0, 2, 2, 1, 0, 2, 2, 1, 0, 0, 0, 1, 1, 0, 1~
dataset <- datos_sel %>%
  select(-X6) %>%          # X6 was read in as blank character strings, so it carries no information
  drop_na() %>%            # remove rows with any NA (the NAs are concentrated in X5)
  select(Y, everything())  # put the outcome first
glimpse(dataset)
## Rows: 57,410
## Columns: 7
## $ Y <int> 1, 5, 0, 3, 2, 0, 2, 2, 1, 0, 0, 1, 1, 2, 0, 4, 0, 0, 2, 2, 2, 2, 1~
## $ X2 <int> 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1~
## $ X3 <int> 44, 61, 79, 41, 24, 65, 35, 35, 55, 64, 34, 31, 56, 58, 76, 42, 33,~
## $ X4 <int> 2, 6, 6, 2, 2, 6, 6, 2, 2, 2, 2, 2, 6, 1, 6, 6, 6, 2, 6, 2, 6, 6, 6~
## $ X5 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1~
## $ X7 <int> 4, 10, 2, 5, 4, 2, 4, 4, 3, 3, 2, 3, 3, 4, 2, 6, 1, 2, 4, 4, 4, 4, ~
## $ X8 <dbl> 3800000.0, 3656652.8, 4200000.0, 7150000.0, 722500.0, 1533333.3, 14~
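X6 is dropped above because read.csv parsed it as a character column of blank strings (see the glimpse of datos_sel), so it carries no usable information. If the blanks were genuine missing values hiding real codes, a minimal sketch of the alternative would be to recode the blanks to NA and parse the rest as integers (datos_sel_keep_x6 is a hypothetical name):

datos_sel_keep_x6 <- datos_sel %>%
  mutate(X6 = as.integer(na_if(X6, ""))) %>%  # blanks become NA, the rest parse as integers
  drop_na() %>%
  select(Y, everything())

With this file every X6 value is blank, so simply dropping the column, as the pipeline above does, is the right call.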
set.seed(840)
train_test_split <- initial_split(dataset, prop = 0.9)
train_test_split
## <Analysis/Assess/Total>
## <51670/5740/57410>
train_tbl <- training(train_test_split)
test_tbl <- testing(train_test_split)
rec_obj <- recipe(Y ~ ., data = train_tbl) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  prep(training = train_tbl)
rec_obj
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 6
##
## Training data contained 51670 data points and no missing data.
##
## Operations:
##
## Centering for X2, X3, X4, X5, X7, X8 [trained]
## Scaling for X2, X3, X4, X5, X7, X8 [trained]
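The prepped recipe has already memorised the training means and standard deviations it will apply at bake() time. One way to inspect them is the tidy() method from recipes, where the step numbers match the Operations listing above:

tidy(rec_obj, number = 1)  # per-column means learned by step_center
tidy(rec_obj, number = 2)  # per-column standard deviations learned by step_scale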
x_train_tbl <- bake(rec_obj, new_data = train_tbl) %>% select(-Y)
x_test_tbl <- bake(rec_obj, new_data = test_tbl) %>% select(-Y)
glimpse(x_train_tbl)
## Rows: 51,670
## Columns: 6
## $ X2 <dbl> 2.0599694, -0.4854347, -0.4854347, -0.4854347, -0.4854347, -0.48543~
## $ X3 <dbl> -0.206663487, 0.907797093, 2.087814177, -0.403333001, -1.517793581,~
## $ X4 <dbl> -0.7433086, 1.2781794, 1.2781794, -0.7433086, -0.7433086, -0.743308~
## $ X5 <dbl> -0.2941205, -0.2941205, -0.2941205, -0.2941205, -0.2941205, -0.2941~
## $ X7 <dbl> 0.2486192, 4.0443904, -1.0166378, 0.8812477, 0.2486192, 0.2486192, ~
## $ X8 <dbl> 0.57650907, 0.53252462, 0.69924450, 1.60441835, -0.36778671, 0.0789~
y_train_vec <- pull(train_tbl, Y)
y_test_vec <- pull(test_tbl, Y)
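Before choosing an output activation and loss, it is worth checking what Y actually contains; a one-line check (the exact counts depend on the seed):

table(y_train_vec)  # Y is a count of children (0, 1, 2, ...), not a 0/1 label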
ncol(x_train_tbl)
## [1] 6
# Building our Artificial Neural Network
model_keras <- keras_model_sequential()
model_keras %>%
# First hidden layer
layer_dense(
units = 16,
kernel_initializer = "uniform",
activation = "relu",
input_shape = ncol(x_train_tbl)) %>%
# Dropout to prevent overfitting
layer_dropout(rate = 0.1) %>%
# Second hidden layer
layer_dense(
units = 16,
kernel_initializer = "uniform",
activation = "relu") %>%
# Dropout to prevent overfitting
layer_dropout(rate = 0.1) %>%
# Output layer
# (note: a sigmoid unit with binary cross-entropy assumes a 0/1 target;
#  Y here is a count, so see the corrected sketch after the evaluation below)
layer_dense(
units = 1,
kernel_initializer = "uniform",
activation = "sigmoid") %>%
# Compile ANN
compile(
optimizer = 'adam',
loss = 'binary_crossentropy',
metrics = c('accuracy')
)
model_keras
## Model
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_2 (Dense) (None, 16) 112
## ________________________________________________________________________________
## dropout_1 (Dropout) (None, 16) 0
## ________________________________________________________________________________
## dense_1 (Dense) (None, 16) 272
## ________________________________________________________________________________
## dropout (Dropout) (None, 16) 0
## ________________________________________________________________________________
## dense (Dense) (None, 1) 17
## ================================================================================
## Total params: 401
## Trainable params: 401
## Non-trainable params: 0
## ________________________________________________________________________________
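The parameter counts in the summary follow directly from the layer sizes: a dense layer with n inputs and m units has n*m weights plus m biases, i.e. (n + 1) * m parameters:

(6  + 1) * 16  # first hidden layer:  6 inputs  -> 16 units = 112
(16 + 1) * 16  # second hidden layer: 16 inputs -> 16 units = 272
(16 + 1) * 1   # output layer:        16 inputs -> 1 unit   = 17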
# Fit the keras model to the training data
history <- fit(
object = model_keras,
x = as.matrix(x_train_tbl),
y = y_train_vec,
batch_size = 50,
epochs = 35,
validation_split = 0.10
)
plot(history)
## `geom_smooth()` using formula 'y ~ x'
model_keras %>% evaluate(as.matrix(x_train_tbl), y_train_vec)
## loss accuracy
## -1.374479e+07 2.986646e-01
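A negative loss of this size is a red flag rather than a result. binary_crossentropy computes -(y*log(p) + (1 - y)*log(1 - p)) and assumes y is 0 or 1; here Y is a count, so for y > 1 the (1 - y)*log(1 - p) term is unbounded below and the optimizer drives the "loss" towards minus infinity, which also makes the reported accuracy meaningless. A minimal corrected sketch, keeping the same architecture and inputs but treating Y as a regression target and evaluating on the held-out test set rather than the training data (model_reg and history_reg are hypothetical names; loss = "poisson" with an exponential output activation would be a count-specific alternative):

model_reg <- keras_model_sequential()
model_reg %>%
  layer_dense(units = 16, kernel_initializer = "uniform", activation = "relu",
              input_shape = ncol(x_train_tbl)) %>%
  layer_dropout(rate = 0.1) %>%
  layer_dense(units = 16, kernel_initializer = "uniform", activation = "relu") %>%
  layer_dropout(rate = 0.1) %>%
  layer_dense(units = 1, activation = "linear") %>%  # unbounded output for a count target
  compile(
    optimizer = "adam",
    loss = "mse",        # mean squared error is a sensible default for regression
    metrics = c("mae")
  )
history_reg <- fit(
  object = model_reg,
  x = as.matrix(x_train_tbl),
  y = y_train_vec,
  batch_size = 50,
  epochs = 35,
  validation_split = 0.10
)
model_reg %>% evaluate(as.matrix(x_test_tbl), y_test_vec)

With mse the loss is bounded below by zero, so a negative value like the one above can no longer occur.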