datos <- read.csv("datos.csv")
datos <- datos[, c(1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 8)]  # move the original column 8 (Hijos) to the end
head(datos)
##   X DIRECTORIO P6020 P6040 P5502 P6071 P5667 CANT_PERSONAS_HOGAR   I_HOGAR
## 1 1    7120001     2    44     2     1                         4 3800000.0
## 2 2    7120002     1    61     6     1                        10 3656652.8
## 3 3    7120005     1    79     6     1                         2 4200000.0
## 4 4    7120006     2    37     5    NA                         3 1365000.0
## 5 5    7120007     1    72     5    NA                         3  765833.3
## 6 6    7120008     1    73     5    NA                         1 1785721.9
##   P5010 Hijos
## 1     2     1
## 2     4     5
## 3     1     0
## 4     2     0
## 5     3     0
## 6     1     0

The columns are renamed to generic names; the mapping to the original survey variables is:

- X1 = DIRECTORIO
- X2 = P6020
- X3 = P6040
- X4 = P5502
- X5 = P6071
- X6 = P5667
- X7 = CANT_PERSONAS_HOGAR
- X8 = I_HOGAR
- X9 = P5010
- Y = Hijos

colnames(datos) <- c("X", "X1", "X2","X3", "X4","X5", "X6","X7","X8", "X9", "Y")
head(datos)
##   X      X1 X2 X3 X4 X5 X6 X7        X8 X9 Y
## 1 1 7120001  2 44  2  1     4 3800000.0  2 1
## 2 2 7120002  1 61  6  1    10 3656652.8  4 5
## 3 3 7120005  1 79  6  1     2 4200000.0  1 0
## 4 4 7120006  2 37  5 NA     3 1365000.0  2 0
## 5 5 7120007  1 72  5 NA     3  765833.3  3 0
## 6 6 7120008  1 73  5 NA     1 1785721.9  1 0
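The positional `colnames()` call above works, but it depends silently on the column order fixed earlier. A name-based alternative (a sketch using `dplyr::rename()`, available once the tidyverse is loaded below) is easier to audit:

datos <- datos %>%
  rename(X1 = DIRECTORIO, X2 = P6020, X3 = P6040, X4 = P5502, X5 = P6071,
         X6 = P5667, X7 = CANT_PERSONAS_HOGAR, X8 = I_HOGAR, X9 = P5010,
         Y = Hijos)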
# Keep the predictors of interest plus the response; the row index, X1 (an identifier) and X9 are dropped
datos_sel <- subset(datos, select = c('X2','X3','X4','X5','X6','X7','X8','Y'))
library(knitr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(keras)
library(lime)
## 
## Attaching package: 'lime'
## The following object is masked from 'package:dplyr':
## 
##     explain
library(tidyquant)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## == Need to Learn tidyquant? ====================================================
## Business Science offers a 1-hour course - Learning Lab #9: Performance Analysis & Portfolio Optimization with tidyquant!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(rsample)
library(recipes)
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
library(yardstick)
## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.
## 
## Attaching package: 'yardstick'
## The following object is masked from 'package:keras':
## 
##     get_weights
## The following object is masked from 'package:readr':
## 
##     spec
library(corrr)
glimpse(datos_sel)
## Rows: 93,160
## Columns: 8
## $ X2 <int> 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1~
## $ X3 <int> 44, 61, 79, 37, 72, 73, 74, 41, 26, 39, 24, 60, 65, 35, 35, 55, 64,~
## $ X4 <int> 2, 6, 6, 5, 5, 5, 3, 2, 4, 4, 2, 3, 6, 6, 2, 2, 2, 2, 5, 4, 2, 5, 6~
## $ X5 <int> 1, 1, 1, NA, NA, NA, NA, 1, NA, NA, 1, NA, 1, 1, 1, 1, 1, 1, NA, NA~
## $ X6 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",~
## $ X7 <int> 4, 10, 2, 3, 3, 1, 2, 5, 4, 3, 4, 6, 2, 4, 4, 3, 3, 2, 1, 2, 3, 3, ~
## $ X8 <dbl> 3800000.0, 3656652.8, 4200000.0, 1365000.0, 765833.3, 1785721.9, 18~
## $ Y  <int> 1, 5, 0, 0, 0, 0, 1, 3, 0, 2, 2, 1, 0, 2, 2, 1, 0, 0, 0, 1, 1, 0, 1~
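Before modeling, it is worth quantifying what `glimpse()` hints at: X6 prints as an empty character column and X5 contains NAs. A quick missingness count (a minimal sketch):

datos_sel %>%
  summarise(across(everything(), ~ sum(is.na(.x) | .x == "")))

This motivates the next step: drop X6 entirely and remove the remaining incomplete rows.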
dataset <- datos_sel %>%
  select(-X6) %>%           # X6 (P5667) is an empty character column
  drop_na() %>%             # remove rows with missing values
  select(Y, everything())   # put the response first

glimpse(dataset)
## Rows: 57,410
## Columns: 7
## $ Y  <int> 1, 5, 0, 3, 2, 0, 2, 2, 1, 0, 0, 1, 1, 2, 0, 4, 0, 0, 2, 2, 2, 2, 1~
## $ X2 <int> 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1~
## $ X3 <int> 44, 61, 79, 41, 24, 65, 35, 35, 55, 64, 34, 31, 56, 58, 76, 42, 33,~
## $ X4 <int> 2, 6, 6, 2, 2, 6, 6, 2, 2, 2, 2, 2, 6, 1, 6, 6, 6, 2, 6, 2, 6, 6, 6~
## $ X5 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1~
## $ X7 <int> 4, 10, 2, 5, 4, 2, 4, 4, 3, 3, 2, 3, 3, 4, 2, 6, 1, 2, 4, 4, 4, 4, ~
## $ X8 <dbl> 3800000.0, 3656652.8, 4200000.0, 7150000.0, 722500.0, 1533333.3, 14~
set.seed(840)
train_test_split <- initial_split(dataset, prop=0.9)
train_test_split
## <Analysis/Assess/Total>
## <51670/5740/57410>
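initial_split() samples completely at random. If you want the train and test sets to carry similar distributions of the response, rsample also supports stratified sampling; an optional variant (not the split used for the results below):

set.seed(840)
train_test_split <- initial_split(dataset, prop = 0.9, strata = Y)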
train_tbl <- training(train_test_split)
test_tbl  <- testing(train_test_split)
rec_obj <- recipe(Y ~ ., data = train_tbl) %>%
  step_center(all_predictors()) %>%  # subtract the training means
  step_scale(all_predictors()) %>%   # divide by the training standard deviations
  prep(training = train_tbl)
rec_obj
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          6
## 
## Training data contained 51670 data points and no missing data.
## 
## Operations:
## 
## Centering for X2, X3, X4, X5, X7, X8 [trained]
## Scaling for X2, X3, X4, X5, X7, X8 [trained]
x_train_tbl <- bake(rec_obj, new_data = train_tbl) %>% select(-Y)
x_test_tbl  <- bake(rec_obj, new_data = test_tbl) %>% select(-Y)

glimpse(x_train_tbl)
## Rows: 51,670
## Columns: 6
## $ X2 <dbl> 2.0599694, -0.4854347, -0.4854347, -0.4854347, -0.4854347, -0.48543~
## $ X3 <dbl> -0.206663487, 0.907797093, 2.087814177, -0.403333001, -1.517793581,~
## $ X4 <dbl> -0.7433086, 1.2781794, 1.2781794, -0.7433086, -0.7433086, -0.743308~
## $ X5 <dbl> -0.2941205, -0.2941205, -0.2941205, -0.2941205, -0.2941205, -0.2941~
## $ X7 <dbl> 0.2486192, 4.0443904, -1.0166378, 0.8812477, 0.2486192, 0.2486192, ~
## $ X8 <dbl> 0.57650907, 0.53252462, 0.69924450, 1.60441835, -0.36778671, 0.0789~
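As a sanity check, every baked predictor should now have mean ≈ 0 and standard deviation ≈ 1, since the recipe was trained on this same data (a minimal sketch):

x_train_tbl %>%
  summarise(across(everything(), list(mu = mean, sd = sd)))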
y_train_vec <- pull(train_tbl, Y)
y_test_vec <- pull(test_tbl, Y)
ncol(x_train_tbl)  # number of predictors, i.e. the input size of the network
## [1] 6
# Building our Artificial Neural Network
model_keras <- keras_model_sequential()

model_keras %>% 
  
  # First hidden layer
  layer_dense(
    units              = 16, 
    kernel_initializer = "uniform", 
    activation         = "relu", 
    input_shape        = ncol(x_train_tbl)) %>% 
  
  # Dropout to prevent overfitting
  layer_dropout(rate = 0.1) %>%
  
  # Second hidden layer
  layer_dense(
    units              = 16, 
    kernel_initializer = "uniform", 
    activation         = "relu") %>% 
  
  # Dropout to prevent overfitting
  layer_dropout(rate = 0.1) %>%
  
  # Output layer
  layer_dense(
    units              = 1, 
    kernel_initializer = "uniform", 
    activation         = "sigmoid") %>% 
  
  # Compile ANN
  compile(
    optimizer = 'adam',
    loss      = 'binary_crossentropy',
    metrics   = c('accuracy')
  )
model_keras
## Model
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_2 (Dense)                     (None, 16)                      112         
## ________________________________________________________________________________
## dropout_1 (Dropout)                 (None, 16)                      0           
## ________________________________________________________________________________
## dense_1 (Dense)                     (None, 16)                      272         
## ________________________________________________________________________________
## dropout (Dropout)                   (None, 16)                      0           
## ________________________________________________________________________________
## dense (Dense)                       (None, 1)                       17          
## ================================================================================
## Total params: 401
## Trainable params: 401
## Non-trainable params: 0
## ________________________________________________________________________________
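The parameter counts in the summary follow directly from the layer sizes: each dense layer has (inputs × units) weights plus one bias per unit.

6 * 16 + 16    # first hidden layer:  112
16 * 16 + 16   # second hidden layer: 272
16 * 1 + 1     # output layer:         17
               # total: 112 + 272 + 17 = 401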
# Fit the keras model to the training data
history <- fit(
  object           = model_keras, 
  x                = as.matrix(x_train_tbl), 
  y                = y_train_vec,
  batch_size       = 50, 
  epochs           = 35,
  validation_split = 0.10
)
plot(history)
## `geom_smooth()` using formula 'y ~ x'
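The per-epoch numbers behind the plot are stored in the history object itself, e.g.:

str(history$metrics)  # loss, accuracy, val_loss and val_accuracy per epoch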

# Evaluate on the training data (see the note on the test set below)
model_keras %>% evaluate(as.matrix(x_train_tbl), y_train_vec)
##          loss      accuracy 
## -1.374479e+07  2.986646e-01
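The enormous negative loss is a red flag rather than a good fit: binary cross-entropy assumes targets in {0, 1}, but Y counts children (0, 1, 2, ...). For y > 1 the per-sample loss is unbounded below, so the optimizer can push it toward −∞ and the reported accuracy is not meaningful. Note also that the call above evaluates on the training data; the held-out split gives the honest figure:

model_keras %>% evaluate(as.matrix(x_test_tbl), y_test_vec)

A count-appropriate alternative keeps the same architecture but swaps the output activation and loss; a sketch (not the configuration fitted above):

model_pois <- keras_model_sequential() %>%
  layer_dense(units = 16, kernel_initializer = "uniform", activation = "relu",
              input_shape = ncol(x_train_tbl)) %>%
  layer_dropout(rate = 0.1) %>%
  layer_dense(units = 16, kernel_initializer = "uniform", activation = "relu") %>%
  layer_dropout(rate = 0.1) %>%
  layer_dense(units = 1, activation = "exponential") %>%  # positive rate for a count
  compile(optimizer = "adam", loss = "poisson", metrics = "mae")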