Instalar paquetes y llamar librerías

La regresión logística es un modelo estadístico de clasificación binaria, que estima la probabilidad de que ocurra un evento (valor 1) frente a que no ocurra (valor 0), en función de variables independientes.

Instalar paquetes y llamar librerías

# install.packages(c("titanic","caret","tidyverse"))

suppressPackageStartupMessages({
  library(titanic)
  library(caret)
  library(tidyverse)
})

Crear la base de datos

df <- titanic_train

Entender la base de datos

summary(df)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
str(df)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
df <- df[, c("Survived", "Pclass", "Sex", "Age")]
df <- na.omit(df)
df$Survived <- as.factor(df$Survived)
df$Pclass   <- as.factor(df$Pclass)
df$Sex      <- as.factor(df$Sex)

Crear el modelo

modelo <- glm(Survived ~ ., data = df, family = binomial)
summary(modelo)
## 
## Call:
## glm(formula = Survived ~ ., family = binomial, data = df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.777013   0.401123   9.416  < 2e-16 ***
## Pclass2     -1.309799   0.278066  -4.710 2.47e-06 ***
## Pclass3     -2.580625   0.281442  -9.169  < 2e-16 ***
## Sexmale     -2.522781   0.207391 -12.164  < 2e-16 ***
## Age         -0.036985   0.007656  -4.831 1.36e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 964.52  on 713  degrees of freedom
## Residual deviance: 647.28  on 709  degrees of freedom
## AIC: 657.28
## 
## Number of Fisher Scoring iterations: 5

Probar el modelo

prueba <-  data.frame(Pclass=as.factor(c(1,3)), Sex=as.factor(c("female", "male")), Age=c(25,40))

probabilidad <- predict(modelo, newdata=prueba, type="response")

cbind(prueba, Probabilidad_Sobrevivir= probabilidad)
##   Pclass    Sex Age Probabilidad_Sobrevivir
## 1      1 female  25              0.94544163
## 2      3   male  40              0.05701133
LS0tCnRpdGxlOiAiUmVncmVzacOzbiBMb2fDrXN0aWNhIgphdXRob3I6ICJBbm5hIER1csOhbiBBMDEyODU2NzQiCmRhdGU6ICdgciBTeXMuRGF0ZSgpYCcKb3V0cHV0OgogICAgICAgIGh0bWxfZG9jdW1lbnQ6CiAgICAgICAgICAgICAgICB0b2M6IFRSVUUKICAgICAgICAgICAgICAgIHRvY19mbG9hdDogVFJVRQogICAgICAgICAgICAgICAgY29kZV9kb3dubG9hZDogVFJVRQogICAgICAgICAgICAgICAgdGhlbWU6IGJvb3RzdHJhcAotLS0KCiFbXShodHRwczovL21lZGlhMS50ZW5vci5jb20vbS9xWUJiRWpBNl9jSUFBQUFkL2EtbmlnaHQtdG8tcmVtZW1iZXItbW92aWUtYS1uaWdodC10by1yZW1lbWJlci5naWYpCgojIDxzcGFuIHN0eWxlPSJjb2xvcjojQjNCRUZGIj4gSW5zdGFsYXIgcGFxdWV0ZXMgeSBsbGFtYXIgbGlicmVyw61hcyA8L3NwYW4+CkxhICoqcmVncmVzacOzbiBsb2fDrXN0aWNhKiogZXMgdW4gbW9kZWxvIGVzdGFkw61zdGljbyBkZSBjbGFzaWZpY2FjacOzbiBiaW5hcmlhLCBxdWUgZXN0aW1hIGxhIHByb2JhYmlsaWRhZCBkZSBxdWUgb2N1cnJhIHVuIGV2ZW50byAodmFsb3IgMSkgZnJlbnRlIGEgcXVlIG5vIG9jdXJyYSAodmFsb3IgMCksIGVuIGZ1bmNpw7NuIGRlIHZhcmlhYmxlcyBpbmRlcGVuZGllbnRlcy4KCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiNCM0JFRkYiPiBJbnN0YWxhciBwYXF1ZXRlcyB5IGxsYW1hciBsaWJyZXLDrWFzIDwvc3Bhbj4KYGBge3J9CiMgaW5zdGFsbC5wYWNrYWdlcyhjKCJ0aXRhbmljIiwiY2FyZXQiLCJ0aWR5dmVyc2UiKSkKCnN1cHByZXNzUGFja2FnZVN0YXJ0dXBNZXNzYWdlcyh7CiAgbGlicmFyeSh0aXRhbmljKQogIGxpYnJhcnkoY2FyZXQpCiAgbGlicmFyeSh0aWR5dmVyc2UpCn0pCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6I0IzQkVGRiI+IENyZWFyIGxhIGJhc2UgZGUgZGF0b3MgPC9zcGFuPgpgYGB7cn0KZGYgPC0gdGl0YW5pY190cmFpbgpgYGAKCiMgPHNwYW4gc3R5bGU9ImNvbG9yOiNCM0JFRkYiPiBFbnRlbmRlciBsYSBiYXNlIGRlIGRhdG9zIDwvc3Bhbj4KYGBge3J9CnN1bW1hcnkoZGYpCnN0cihkZikKZGYgPC0gZGZbLCBjKCJTdXJ2aXZlZCIsICJQY2xhc3MiLCAiU2V4IiwgIkFnZSIpXQpkZiA8LSBuYS5vbWl0KGRmKQpkZiRTdXJ2aXZlZCA8LSBhcy5mYWN0b3IoZGYkU3Vydml2ZWQpCmRmJFBjbGFzcyAgIDwtIGFzLmZhY3RvcihkZiRQY2xhc3MpCmRmJFNleCAgICAgIDwtIGFzLmZhY3RvcihkZiRTZXgpCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6I0IzQkVGRiI+IENyZWFyIGVsIG1vZGVsbyA8L3NwYW4+CmBgYHtyfQptb2RlbG8gPC0gZ2xtKFN1cnZpdmVkIH4gLiwgZGF0YSA9IGRmLCBmYW1pbHkgPSBiaW5vbWlhbCkKc3VtbWFyeShtb2RlbG8pCmBgYAoKIyA8c3BhbiBzdHlsZT0iY29sb3I6I0IzQkVGRiI+IFByb2JhciBlbCBtb2RlbG8gPC9zcGFuPgpgYGB7cn0KcHJ1ZWJhIDwtICBkYXRhLmZyYW1lKFBjbGFzcz1hcy5mYWN0b3IoYygxLDMpKSwgU2V4PWFzLmZhY3RvcihjKCJmZW1hbGUiLCAibWFsZSIpKSwgQWdlPWMoMjUsNDApKQoKcHJvYmFiaWxpZGFkIDwtIHByZWRpY3QobW9kZWxvLCBuZXdkYXRhPXBydWViYSwgdHlwZT0icmVzcG9uc2UiKQoKY2JpbmQocHJ1ZWJhLCBQcm9iYWJpbGlkYWRfU29icmV2aXZpcj0gcHJvYmFiaWxpZGFkKQpgYGAKCg==