La regresión logística es un modelo estadístico de clasificación binaria que estima la probabilidad de que ocurra un evento (valor 1) frente a que no ocurra (valor 0), en función de una o más variables independientes.
#install.packages("titanic")
library(titanic)
library(caret)
## Cargando paquete requerido: ggplot2
## Cargando paquete requerido: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Copy the `titanic_train` data set (made available by library(titanic)
# above) into a working data frame so later steps can modify it freely.
df <- titanic_train
# Print per-column summary statistics; the recorded output follows below.
summary(df)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Show each column's storage type and first values; the recorded output
# below shows Survived and Pclass stored as int and Sex as chr.
str(df)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
# ---- Data preparation ------------------------------------------------------
# Keep only the response (Survived) and the three predictors used below.
df <- df[, c("Survived", "Pclass", "Sex", "Age")]
# Drop rows with missing values (Age is the only selected column with NA's
# in the summary above), since glm() needs complete cases.
df <- na.omit(df)

# Encode the response and the categorical predictors as factors.
# Each conversion must be assigned back to its OWN column. An earlier version
# wrote all three results into df$Survived, so the response ended up being a
# copy of Sex; with Sex also on the right-hand side of `Survived ~ .` the data
# were perfectly separated, which is what produced the
# "glm.fit: algorithm did not converge" warning, standard errors around 1e4,
# a residual deviance of ~0 and predicted probabilities of exactly 0 / 1.
df$Survived <- as.factor(df$Survived)
df$Pclass <- as.factor(df$Pclass)
df$Sex <- as.factor(df$Sex)

# ---- Model -----------------------------------------------------------------
# Logistic regression of Survived on all remaining columns. With a factor
# response, glm() treats the first level ("0") as failure, so the fitted
# probabilities are P(Survived = 1).
modelo <- glm(Survived ~ ., data = df, family = binomial)
summary(modelo)
# NOTE: the output previously recorded here came from the buggy fit described
# above and has been removed; re-run the script to regenerate it.

# ---- Prediction on new passengers -------------------------------------------
# New observations must use the same types as the training data: Pclass and
# Sex are factors, so build them with the training levels to guarantee that
# predict() maps them onto the same dummy columns.
prueba <- data.frame(
  Pclass = factor(c(1, 3), levels = levels(df$Pclass)),
  Sex = factor(c("female", "male"), levels = levels(df$Sex)),
  Age = c(25, 40)
)
# type = "response" returns probabilities instead of log-odds.
probabilidad <- predict(modelo, newdata = prueba, type = "response")
cbind(prueba, Probabilidad_Sobrevive = probabilidad)
# NOTE: the table previously recorded here (2.9e-12 for the female passenger,
# 1 for the male passenger) was an artifact of the same bug; re-run the
# script to obtain the survival probabilities.