La Regresión logística es un modelo estadístico de classificación binaria, que estima la probabilidad de que ocurra un evento (valor 1) frentea que no ocurra (valor 0), en función de variables independientes.
#install.packages("titanic")
library(titanic)
#install.packages("caret")
library(caret)
#install.packages("tidyverse")
library(tidyverse)
df <- titanic_train
summary(df)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
str(df)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
df<- df[,c("Survived","Pclass", "Sex","Age")]
df <- na.omit(df)
df$Sex <- as.factor(df$Sex)
df$Pclass <- as.factor(df$Pclass)
df$Survived <- as.factor(df$Survived)
modelo <-glm(Survived ~ ., data=df, family=binomial)
summary(modelo)
##
## Call:
## glm(formula = Survived ~ ., family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.777013 0.401123 9.416 < 2e-16 ***
## Pclass2 -1.309799 0.278066 -4.710 2.47e-06 ***
## Pclass3 -2.580625 0.281442 -9.169 < 2e-16 ***
## Sexmale -2.522781 0.207391 -12.164 < 2e-16 ***
## Age -0.036985 0.007656 -4.831 1.36e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 647.28 on 709 degrees of freedom
## AIC: 657.28
##
## Number of Fisher Scoring iterations: 5
prueba <- data.frame(Pclass = as.factor(c(1, 3)),
Sex = as.factor(c("female", "male")),
Age = c(25, 40))
# Aquí aplicas la predicción sobre el nuevo conjunto de datos
probabilidad <- predict(modelo, newdata = prueba, type = "response")
# Combinas el dataframe original con la probabilidad de sobrevivir
cbind(prueba, Probabilidad_Sobrevive = probabilidad)
## Pclass Sex Age Probabilidad_Sobrevive
## 1 1 female 25 0.94544163
## 2 3 male 40 0.05701133