Teoría

La Regresión logística es un modelo estadístico de classificación binaria, que estima la probabilidad de que ocurra un evento (valor 1) frentea que no ocurra (valor 0), en función de variables independientes.

Instalar paquetes y llamar librerías

#install.packages("titanic")
library(titanic)
#install.packages("caret")
library(caret)
#install.packages("tidyverse")
library(tidyverse)

Importar base de datos

df <- titanic_train

Entender base de datos

summary(df)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
str(df)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
df<- df[,c("Survived","Pclass", "Sex","Age")]
df <- na.omit(df)
df$Sex <- as.factor(df$Sex)
df$Pclass <- as.factor(df$Pclass)
df$Survived <- as.factor(df$Survived)

Crear el modelo

modelo <-glm(Survived ~ ., data=df, family=binomial)
summary(modelo)
## 
## Call:
## glm(formula = Survived ~ ., family = binomial, data = df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.777013   0.401123   9.416  < 2e-16 ***
## Pclass2     -1.309799   0.278066  -4.710 2.47e-06 ***
## Pclass3     -2.580625   0.281442  -9.169  < 2e-16 ***
## Sexmale     -2.522781   0.207391 -12.164  < 2e-16 ***
## Age         -0.036985   0.007656  -4.831 1.36e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 964.52  on 713  degrees of freedom
## Residual deviance: 647.28  on 709  degrees of freedom
## AIC: 657.28
## 
## Number of Fisher Scoring iterations: 5

Probar el modelo

prueba <- data.frame(Pclass = as.factor(c(1, 3)), 
                     Sex = as.factor(c("female", "male")), 
                     Age = c(25, 40))

# Aquí aplicas la predicción sobre el nuevo conjunto de datos
probabilidad <- predict(modelo, newdata = prueba, type = "response")

# Combinas el dataframe original con la probabilidad de sobrevivir
cbind(prueba, Probabilidad_Sobrevive = probabilidad)
##   Pclass    Sex Age Probabilidad_Sobrevive
## 1      1 female  25             0.94544163
## 2      3   male  40             0.05701133
LS0tCnRpdGxlOiAiUmVncmVzacOzbiBMb2fDrXN0aWNhIgphdXRob3I6ICJSaWdvYmVydG8iCmRhdGU6ICIyMDI1LTA4LTI5IgpvdXRwdXQ6IAogIGh0bWxfZG9jdW1lbnQ6CiAgICB0b2M6IFRSVUUKICAgIHRvY19mbG9hdDogVFJVRQogICAgY29kZV9kb3dubG9hZDogVFJVRQogICAgdGhlbWU6IGJvb3RzdHJhcAotLS0KCiFbXShodHRwczovL21lZGlhMS50ZW5vci5jb20vbS9xWUJiRWpBNl9jSUFBQUFkL2EtbmlnaHQtdG8tcmVtZW1iZXItbW92aWUtYS1uaWdodC10by1yZW1lbWJlci5naWYpCgojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlIj4gVGVvcsOtYSA8L3NwYW4+CkxhICoqUmVncmVzacOzbiBsb2fDrXN0aWNhKiogZXMgdW4gbW9kZWxvIGVzdGFkw61zdGljbyBkZSBjbGFzc2lmaWNhY2nDs24gYmluYXJpYSwgcXVlIGVzdGltYSBsYSBwcm9iYWJpbGlkYWQgZGUgcXVlIG9jdXJyYSB1biBldmVudG8gKHZhbG9yIDEpIGZyZW50ZWEgcXVlIG5vIG9jdXJyYSAodmFsb3IgMCksIGVuIGZ1bmNpw7NuIGRlIHZhcmlhYmxlcyBpbmRlcGVuZGllbnRlcy4gCgojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlIj4gSW5zdGFsYXIgcGFxdWV0ZXMgeSBsbGFtYXIgbGlicmVyw61hcyA8L3NwYW4+CmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CiNpbnN0YWxsLnBhY2thZ2VzKCJ0aXRhbmljIikKbGlicmFyeSh0aXRhbmljKQojaW5zdGFsbC5wYWNrYWdlcygiY2FyZXQiKQpsaWJyYXJ5KGNhcmV0KQojaW5zdGFsbC5wYWNrYWdlcygidGlkeXZlcnNlIikKbGlicmFyeSh0aWR5dmVyc2UpCgpgYGAKIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZSI+IEltcG9ydGFyIGJhc2UgZGUgZGF0b3MgPC9zcGFuPgpgYGB7cn0KZGYgPC0gdGl0YW5pY190cmFpbgpgYGAKCgojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlIj4gRW50ZW5kZXIgYmFzZSBkZSBkYXRvcyA8L3NwYW4+CgpgYGB7cn0Kc3VtbWFyeShkZikKc3RyKGRmKQpkZjwtIGRmWyxjKCJTdXJ2aXZlZCIsIlBjbGFzcyIsICJTZXgiLCJBZ2UiKV0KZGYgPC0gbmEub21pdChkZikKZGYkU2V4IDwtIGFzLmZhY3RvcihkZiRTZXgpCmRmJFBjbGFzcyA8LSBhcy5mYWN0b3IoZGYkUGNsYXNzKQpkZiRTdXJ2aXZlZCA8LSBhcy5mYWN0b3IoZGYkU3Vydml2ZWQpCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZSI+IENyZWFyIGVsIG1vZGVsbyA8L3NwYW4+CmBgYHtyfQptb2RlbG8gPC1nbG0oU3Vydml2ZWQgfiAuLCBkYXRhPWRmLCBmYW1pbHk9Ymlub21pYWwpCnN1bW1hcnkobW9kZWxvKQpgYGAKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOmJsdWUiPiBQcm9iYXIgZWwgbW9kZWxvIDwvc3Bhbj4KYGBge3J9CnBydWViYSA8LSBkYXRhLmZyYW1lKFBjbGFzcyA9IGFzLmZhY3RvcihjKDEsIDMpKSwgCiAgICAgICAgICAgICAgICAgICAgIFNleCA9IGFzLmZhY3RvcihjKCJmZW1hbGUiLCAibWFsZSIpKSwgCiAgICAgICAgICAgICAgICAgICAgIEFnZSA9IGMoMjUsIDQwKSkKCiMgQXF1w60gYXBsaWNhcyBsYSBwcmVkaWNjacOzbiBzb2JyZSBlbCBudWV2byBjb25qdW50byBkZSBkYXRvcwpwcm9iYWJpbGlkYWQgPC0gcHJlZGljdChtb2RlbG8sIG5ld2RhdGEgPSBwcnVlYmEsIHR5cGUgPSAicmVzcG9uc2UiKQoKIyBDb21iaW5hcyBlbCBkYXRhZnJhbWUgb3JpZ2luYWwgY29uIGxhIHByb2JhYmlsaWRhZCBkZSBzb2JyZXZpdmlyCmNiaW5kKHBydWViYSwgUHJvYmFiaWxpZGFkX1NvYnJldml2ZSA9IHByb2JhYmlsaWRhZCkKCmBgYAoKCgoKCg==