REGRESIÓN LOGÍSTICA SIMPLE
# Instalar paquetes
##install.packages("ISLR")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ISLR)
library(dplyr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
#library(read)
## ISLR contiene datasets del libro "An Introduction to Statistical
## Learning with Applications in R"
Leer el archivo y crear el data frame
# Read the CSV file into a data frame. The path has no directory component,
# so the file is looked up in the current working directory.
bank_full <- utils::read.csv(file = "Bank_Full_comas.csv")
# class(bank_full)
# Print the structure of the data frame (columns, types, first values).
utils::str(object = bank_full)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education: chr "tertiary" "secondary" "secondary" "unknown" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : chr "yes" "yes" "yes" "yes" ...
## $ loan : chr "no" "no" "yes" "no" ...
## $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr "may" "may" "may" "may" ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
Cambiar la variable de tipo chr a factor
# balance is the independent variable (x axis); it is an integer column.
# y is the dependent variable (y axis): the model describes the probability
# that a CDT is opened.
# Convert the character column y into a factor.
bank_full[["y"]] <- as.factor(bank_full[["y"]])
# To convert every character column to a factor instead:
# bank_full <- bank_full %>% mutate_if(is.character, as.factor)
# To check the structure after the conversion:
# str(bank_full)
Se recodifican los niveles de la variable y con valores numéricos: "no" (0), "yes" (1)
# Keep only the two columns used by the model:
#   y       - categorical response (dependent variable, y axis)
#   balance - numeric predictor (independent variable, x axis)
bank_full <- select(bank_full, y, balance)
# Replace the levels of y with numbers: "no" becomes 0 and "yes" becomes 1.
bank_full <- mutate(bank_full, y = recode(y, "no" = 0, "yes" = 1))
# Show the first rows to confirm the recoding.
head(bank_full)
## y balance
## 1 0 2143
## 2 0 29
## 3 0 2
## 4 0 1506
## 5 0 1
## 6 0 231
Ajuste de un modelo logístico
# Fit a logistic regression of y on balance:
#   y       - response (dependent variable, y axis)
#   balance - predictor (independent variable, x axis)
# The binomial family is used because the response has two possible values.
modelo_logistico3 <- glm(
  formula = y ~ balance,
  family = "binomial",
  data = bank_full
)
# names(modelo_logistico)
Representación gráfica del modelo
# Scatter of the observed 0/1 responses against balance, with the fitted
# probability curve drawn on top.
ggplot(bank_full, aes(balance, y)) +
  # Hollow circles (shape 1), coloured by the value of y.
  geom_point(mapping = aes(colour = as.factor(y)), shape = 1) +
  # Evaluate the fitted model along the x axis; type = "response" returns
  # predictions on the probability scale rather than the link scale.
  stat_function(
    fun = function(x) {
      predict(
        modelo_logistico3,
        newdata = data.frame(balance = x),
        type = "response"
      )
    }
  ) +
  labs(
    title = "Regresión logística simple modelo #3",
    y = "Probabilidad y"
  ) +
  theme_bw() +
  # Must come after theme_bw(), which would otherwise reset the legend setting.
  theme(legend.position = "none")
Con geom_smooth se puede obtener el gráfico directamente
# Build the scatter plot and let geom_smooth() fit and draw the binomial GLM
# curve itself, so no explicit predict() call is needed.
b <- ggplot(bank_full, aes(balance, y)) +
  geom_point(mapping = aes(colour = as.factor(y)), shape = 1) +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE, # draw the fitted line only, without a confidence band
    colour = "gray20"
  ) +
  theme_bw() +
  theme(legend.position = "none")
# NOTE(review): the plot is only stored in `b` and is not printed here;
# uncomment the line below to display it through plotly.
# ggplotly(b)
Identificar la ecuación
# Model equation on the log-odds scale: ln(p / (1 - p)) = β0 + β1 * X
# Store the model summary so its coefficient table can be reused.
summary3 <- summary(object = modelo_logistico3)
# summary3
# Intercept (β0) and slope (β1) estimates taken from the coefficient table:
# Intercepto3 <- summary3$coefficients[1, 1]; Intercepto3
# Pendiente3 <- summary3$coefficients[2, 1]; Pendiente3