20240822-Clase_12_Bank

REGRESIÓN LOGÍSTICA SIMPLE

# Instalar paquetes
##install.packages("ISLR")
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(ISLR) 
library(dplyr)
library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

#library(read)

## ISLR contiene datasets del libro "An Introduction to Statistical 
## Learning with Applications in R"

Leer el archivo y crear el data frame

# Load the bank data set from a CSV file; the relative path is resolved
# against the current working directory.
bank_full <- read.csv(file = "Bank_Full_comas.csv")

# class(bank_full)

# Show the structure of the data frame (column names, types, first values).

str(object = bank_full)

## 'data.frame':    45211 obs. of  17 variables:
##  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : chr  "management" "technician" "entrepreneur" "blue-collar" ...
##  $ marital  : chr  "married" "single" "married" "married" ...
##  $ education: chr  "tertiary" "secondary" "secondary" "unknown" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing  : chr  "yes" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "no" "yes" "no" ...
##  $ contact  : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : chr  "may" "may" "may" "may" ...
##  $ duration : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...

Convertir la variable de tipo chr a factor

## Modelling roles: `balance` is the independent variable (x-axis) and is
## read as an integer; `y` is the dependent variable (y-axis). The original
## note describes it as the probability that a client opens a CDT
## (presumably a term-deposit certificate); the raw column holds "no"/"yes".

bank_full[["y"]] <- as.factor(bank_full[["y"]])

# To convert every character column to a factor instead:
## bank_full <- bank_full %>% mutate(across(where(is.character), as.factor))

# Check the structure after the conversion:
## str(bank_full)

Se recodifican los niveles con valores NO (0), YES (1)

# Keep only the two modelling columns and encode the response as 0/1.
## `y` is the categorical dependent variable (plotted on the y-axis).
## `balance` is the numeric independent variable (plotted on the x-axis).
## case_match() replaces the superseded recode(); as.character() makes the
## match work whether `y` is still character or already a factor. Any label
## other than "no"/"yes" (and NA) becomes NA.
bank_full <- bank_full %>%
  select(y, balance) %>%
  mutate(y = case_match(as.character(y),
                        "no"  ~ 0,
                        "yes" ~ 1))

# Preview the first rows of the reduced data frame.
head(bank_full)

##   y balance
## 1 0    2143
## 2 0      29
## 3 0       2
## 4 0    1506
## 5 0       1
## 6 0     231

Ajuste de un modelo logístico

## Simple logistic regression with one predictor.
### Dependent (response, y-axis): `y`, the 0/1 outcome.
### Independent (predictor, x-axis): `balance`, numeric.

modelo_logistico3 <- glm(
  formula = y ~ balance,
  family = "binomial",
  data = bank_full
)

# Response variable (dependent) = y
# Predictor variable (independent) = balance
# The binomial family is used because the response has two possible outcomes.

# names(modelo_logistico3)

Representación gráfica del modelo

# Scatter of the observed 0/1 outcomes against balance, overlaid with the
# fitted curve: stat_function() evaluates predict(..., type = "response")
# on the x values it generates, i.e. the model's predicted probability.
ggplot(bank_full, aes(x = balance, y = y)) +
  geom_point(mapping = aes(colour = as.factor(y)), shape = 1) +
  stat_function(
    fun = function(x) {
      predict(
        modelo_logistico3,
        newdata = data.frame(balance = x),
        type = "response"
      )
    }
  ) +
  labs(
    title = "Regresión logística simple modelo #3",
    y = "Probabilidad y"
  ) +
  theme_bw() +
  # The colour only separates the two outcome classes, so hide the legend.
  theme(legend.position = "none")

Con geom_smooth se puede obtener el gráfico directamente

# Same figure built with geom_smooth(), which fits the binomial GLM itself
# instead of reusing modelo_logistico3. The plot is only stored in `b`;
# nothing is drawn until `b` is printed or passed to ggplotly().
b <- ggplot(bank_full, aes(x = balance, y = y)) +
  geom_point(mapping = aes(colour = as.factor(y)), shape = 1) +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    colour = "gray20"
  ) +
  theme_bw() +
  theme(legend.position = "none")
# ggplotly(b)

Identificar la ecuación

# Logit form of the fitted model: ln(p / (1 - p)) = β0 + β1 * X
summary3 <- summary(object = modelo_logistico3)
# summary3

# Coefficient estimates (row 1 = intercept, row 2 = slope for balance):
# Intercepto3 <- summary3$coefficients[1, 1]; Intercepto3
# Pendiente3 <- summary3$coefficients[2, 1]; Pendiente3

20240822-Clase_12_Bank_Full

Andrea Zabala Quimbayo

2024-08-22

Título