# install.packages("reticulate")
library(reticulate)
use_python("C:/Users/Cesar Corregidor/anaconda3/python.exe")

INSTRUCCIONES

data_r <- read.csv("D:/Cesar Corregidor/Documents/especializacion/ANALISIS_DE_REGRESION/Regresion-Aplicada-main/Talleres/Taller3/train.csv", sep = ",")
data <- read.csv("D:/Cesar Corregidor/Documents/especializacion/ANALISIS_DE_REGRESION/Regresion-Aplicada-main/Talleres/Taller3/train.csv", sep = ",")

library(DT)
DT::datatable(data)
str(data)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
dim(data[1])
## [1] 891   1
hist(data_r$Age)

data_py = r_to_py(data_r)
data_py$head()
##    PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
## 0            1         0       3  ...   7.2500               S
## 1            2         1       1  ...  71.2833   C85         C
## 2            3         1       3  ...   7.9250               S
## 3            4         1       1  ...  53.1000  C123         S
## 4            5         0       3  ...   8.0500               S
## 
## [5 rows x 12 columns]
data_py$isnull()$sum()
## PassengerId      0
## Survived         0
## Pclass           0
## Name             0
## Sex              0
## Age            177
## SibSp            0
## Parch            0
## Ticket           0
## Fare             0
## Cabin            0
## Embarked         0
## dtype: int64
data_r = py_to_r(data_py)
library(tidyr)
data_r = data_r %>% drop_na()
dim(data_r)[1]
## [1] 714
mediana = median(data_r$Age)
mediana
## [1] 28
data[is.na(data)] <- mediana
hist(data$Age)

Considere la base de datos Titanic seleccionada. A partir de la misma resuelva:

  1. Realice una corta presentación del conjunto de datos asegurándose de entender cada variable.
Variable descripción
passengerId Id del pasajero
survival: Supervivencia. 0 = No, 1 = Si
pclass: Clase del tiquete 1 = 1st, 2 = 2nd, 3 = 3rd
name: Nombre del pasajero
sex: Género
Age: Edad
sibsp: Números de hermanos / cónyuges a bordo del Titanic
parch: Número de padres / Niños a bordo del Titanic
ticket: Número de ticket
fare: Tarifa de pasajero
cabin: Número de la cabina
embarked: Puerto de embarque C = Cherbourg, Q = Queenstown, S = Southampton
  1. Presente los principales resultados de un Análisis Descriptivo y Exploratorio de los datos. Haga énfasis en la relación que pueda existir entre las variables.
library(tidyverse) 
library(MASS)
library(car)
library(e1071)
library(caret)
library(cowplot)
library(caTools)
library(pROC)
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.1.2
glimpse(data)
## Rows: 891
## Columns: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
## $ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1~
## $ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3~
## $ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl~
## $ Sex         <chr> "male", "female", "female", "female", "male", "male", "mal~
## $ Age         <dbl> 22, 38, 26, 35, 35, 28, 54, 2, 27, 14, 4, 58, 20, 39, 14, ~
## $ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0~
## $ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0~
## $ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37~
## $ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,~
## $ Cabin       <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C~
## $ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"~

Las variables Survived, Pclass, Sex, SibSp, Parch y Embarked las convertimos en variables categóricas

data$Survived=factor(data$Survived)
data$Sex=factor(data$Sex)
data$Pclass=factor(data$Pclass)
data$SibSp=factor(data$SibSp)
data$Parch=factor(data$Parch)
data$Embarked=factor(data$Embarked)
data$Ticket=factor(data$Ticket)
data$Cabin=factor(data$Cabin)
#data %>% filter(Embarked=="")
theme1 <- theme_bw()+
theme(axis.text.x = element_text(angle = 0, hjust = 1, vjust = 0.5),legend.position="none")
theme2 <- theme_bw()+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),legend.position="none")
options(repr.plot.width = 8, repr.plot.height = 5)
plot_grid(ggplot(data, aes(x=Pclass,fill=Survived))+ geom_bar()+theme1,
          ggplot(data, aes(x=Sex,fill=Survived))+ geom_bar()+theme1,
          ggplot(data, aes(x=SibSp,fill=Survived))+ geom_bar()+theme1,
          ggplot(data, aes(x=Parch,fill=Survived))+ geom_bar()+theme1,
          ggplot(data, aes(x=Embarked,fill=Survived))+ geom_bar()+theme1+
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)), align = "h")

- Pclass: Existe una proporción homogénea para el de tipo 2; para el ticket de clase 1 se salvaron más de la mitad de esa población en contraste con los de clase 3 donde la proporción de los que no se salvaron es mayor a los que si lo hicieron.

Realicemos el análisis para las variables cuantitativas

options(repr.plot.width =10, repr.plot.height = 5)
ggplot(data, aes(y= Fare, x = "", fill = Survived)) + 
geom_boxplot()+ 
theme_bw()+
xlab(" ")

Según el boxplot para las personas que no sobrevivieron si afectó bastante la tarifa del pasajero puesto que la mediana está muy cerca del percentil 25 indicando que hay una fuerte concentración de datos en este intervalo. Si bien el comportamiento para las personas que si sobrevivieron tiende a ser el mismo la distribución es un poco más homogenea. También podemos decir que existieron personas que no sobrevivieron a pesar de que la tarija de pasajero era alta

options(repr.plot.width =10, repr.plot.height = 5)
ggplot(data, aes(y= Age, x = "", fill = Survived)) + 
geom_boxplot()+ 
theme_bw()+
xlab(" ")

Podemos decir que hubo personas de avanzada edad y niños que no sobrevivieron. La mediana para las personas que se salvaron y las que no son cercanas y la media tiende a ser la mediana.

options(repr.plot.width =6, repr.plot.height = 4)
data_corr <- round(cor(data[,c("Age", "Fare")]), 1)

ggcorrplot(data_corr,  title = "Correlation")+theme(plot.title = element_text(hjust = 0.5))

No existe correlación entre las variables cuantitativas(Fare y Age)

num_columns <- c("Age", "Fare")
data[num_columns] <- sapply(data[num_columns], as.numeric)

data_int <- data[,c("Age", "Fare")]
data_int <- data.frame(scale(data_int))
data_cat <- data[,-c(1,4,6,9,10,11)]
names(data)
##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
## [11] "Cabin"       "Embarked"
dummy<- data.frame(sapply(data_cat,function(x) data.frame(model.matrix(~x-1,data =data_cat))[,-1]))
head(dummy)
##   Survived Pclass.x2 Pclass.x3 Sex SibSp.x1 SibSp.x2 SibSp.x3 SibSp.x4 SibSp.x5
## 1        0         0         1   1        1        0        0        0        0
## 2        1         0         0   0        1        0        0        0        0
## 3        1         0         1   0        0        0        0        0        0
## 4        1         0         0   0        1        0        0        0        0
## 5        0         0         1   1        0        0        0        0        0
## 6        0         0         1   1        0        0        0        0        0
##   SibSp.x8 Parch.x1 Parch.x2 Parch.x3 Parch.x4 Parch.x5 Parch.x6 Embarked.xC
## 1        0        0        0        0        0        0        0           0
## 2        0        0        0        0        0        0        0           1
## 3        0        0        0        0        0        0        0           0
## 4        0        0        0        0        0        0        0           0
## 5        0        0        0        0        0        0        0           0
## 6        0        0        0        0        0        0        0           0
##   Embarked.xQ Embarked.xS
## 1           0           1
## 2           0           0
## 3           0           1
## 4           0           1
## 5           0           1
## 6           1           0
data_final <- cbind(data_int,dummy)
head(data_final)
##          Age       Fare Survived Pclass.x2 Pclass.x3 Sex SibSp.x1 SibSp.x2
## 1 -0.5654189 -0.5021631        0         0         1   1        1        0
## 2  0.6634884  0.7864036        1         0         0   0        1        0
## 3 -0.2581921 -0.4885799        1         0         1   0        0        0
## 4  0.4330683  0.4204941        1         0         0   0        1        0
## 5  0.4330683 -0.4860644        0         0         1   1        0        0
## 6 -0.1045787 -0.4778481        0         0         1   1        0        0
##   SibSp.x3 SibSp.x4 SibSp.x5 SibSp.x8 Parch.x1 Parch.x2 Parch.x3 Parch.x4
## 1        0        0        0        0        0        0        0        0
## 2        0        0        0        0        0        0        0        0
## 3        0        0        0        0        0        0        0        0
## 4        0        0        0        0        0        0        0        0
## 5        0        0        0        0        0        0        0        0
## 6        0        0        0        0        0        0        0        0
##   Parch.x5 Parch.x6 Embarked.xC Embarked.xQ Embarked.xS
## 1        0        0           0           0           1
## 2        0        0           1           0           0
## 3        0        0           0           0           1
## 4        0        0           0           0           1
## 5        0        0           0           0           1
## 6        0        0           0           1           0
set.seed(123)
indices = sample.split(data_final$Survived, SplitRatio = 0.7)
train = data_final[indices,]
test = data_final[!(indices),]

modelo saturado

modelo_1 = glm(Survived ~ ., data = train, family = "binomial")
summary(modelo_1)
## 
## Call:
## glm(formula = Survived ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4785  -0.5893  -0.4082   0.5815   2.6225  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   18.25576 2759.33446   0.007  0.99472    
## Age           -0.40331    0.13639  -2.957  0.00311 ** 
## Fare          -0.01066    0.14598  -0.073  0.94178    
## Pclass.x2     -0.86875    0.35991  -2.414  0.01579 *  
## Pclass.x3     -1.86853    0.36357  -5.139 2.76e-07 ***
## Sex           -2.72832    0.24210 -11.269  < 2e-16 ***
## SibSp.x1       0.13173    0.27151   0.485  0.62756    
## SibSp.x2      -0.09308    0.60630  -0.154  0.87799    
## SibSp.x3      -1.33359    0.84845  -1.572  0.11600    
## SibSp.x4      -1.69611    0.89442  -1.896  0.05792 .  
## SibSp.x5     -16.74814 1727.51516  -0.010  0.99226    
## SibSp.x8     -16.79883 1340.16276  -0.013  0.99000    
## Parch.x1       0.47579    0.34906   1.363  0.17285    
## Parch.x2      -0.09819    0.43401  -0.226  0.82101    
## Parch.x3       0.19690    1.09001   0.181  0.85665    
## Parch.x4     -16.42203 2010.04393  -0.008  0.99348    
## Parch.x5     -16.93578 1965.65382  -0.009  0.99313    
## Parch.x6     -17.57204 3956.18035  -0.004  0.99646    
## Embarked.xC  -15.37401 2759.33447  -0.006  0.99555    
## Embarked.xQ  -15.64313 2759.33449  -0.006  0.99548    
## Embarked.xS  -16.08735 2759.33447  -0.006  0.99535    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 829.60  on 622  degrees of freedom
## Residual deviance: 527.68  on 602  degrees of freedom
## AIC: 569.68
## 
## Number of Fisher Scoring iterations: 16
modelo_2 <- stepAIC(modelo_1, direction="both")
## Start:  AIC=569.68
## Survived ~ Age + Fare + Pclass.x2 + Pclass.x3 + Sex + SibSp.x1 + 
##     SibSp.x2 + SibSp.x3 + SibSp.x4 + SibSp.x5 + SibSp.x8 + Parch.x1 + 
##     Parch.x2 + Parch.x3 + Parch.x4 + Parch.x5 + Parch.x6 + Embarked.xC + 
##     Embarked.xQ + Embarked.xS
## 
##               Df Deviance    AIC
## - Fare         1   527.69 567.69
## - SibSp.x2     1   527.71 567.71
## - Parch.x3     1   527.72 567.72
## - Parch.x2     1   527.74 567.74
## - SibSp.x1     1   527.92 567.92
## - Embarked.xC  1   528.11 568.11
## - Embarked.xQ  1   528.22 568.22
## - Embarked.xS  1   528.50 568.50
## - Parch.x6     1   529.04 569.04
## - Parch.x4     1   529.47 569.47
## - Parch.x1     1   529.55 569.55
## <none>             527.68 569.68
## - SibSp.x3     1   530.23 570.23
## - SibSp.x5     1   530.46 570.46
## - Parch.x5     1   530.58 570.58
## - SibSp.x4     1   532.04 572.04
## - SibSp.x8     1   532.45 572.45
## - Pclass.x2    1   533.54 573.54
## - Age          1   536.89 576.89
## - Pclass.x3    1   553.98 593.98
## - Sex          1   685.96 725.96
## 
## Step:  AIC=567.69
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x1 + SibSp.x2 + 
##     SibSp.x3 + SibSp.x4 + SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x2 + 
##     Parch.x3 + Parch.x4 + Parch.x5 + Parch.x6 + Embarked.xC + 
##     Embarked.xQ + Embarked.xS
## 
##               Df Deviance    AIC
## - SibSp.x2     1   527.72 565.72
## - Parch.x3     1   527.72 565.72
## - Parch.x2     1   527.75 565.75
## - SibSp.x1     1   527.92 565.92
## - Embarked.xC  1   528.12 566.12
## - Embarked.xQ  1   528.23 566.23
## - Embarked.xS  1   528.50 566.50
## - Parch.x6     1   529.06 567.06
## - Parch.x4     1   529.51 567.51
## - Parch.x1     1   529.56 567.56
## <none>             527.69 567.69
## - SibSp.x3     1   530.32 568.32
## - SibSp.x5     1   530.47 568.47
## - Parch.x5     1   530.61 568.61
## + Fare         1   527.68 569.68
## - SibSp.x4     1   532.05 570.05
## - SibSp.x8     1   532.52 570.52
## - Pclass.x2    1   534.84 572.84
## - Age          1   536.91 574.91
## - Pclass.x3    1   566.17 604.17
## - Sex          1   686.14 724.14
## 
## Step:  AIC=565.72
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x1 + SibSp.x3 + 
##     SibSp.x4 + SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x2 + Parch.x3 + 
##     Parch.x4 + Parch.x5 + Parch.x6 + Embarked.xC + Embarked.xQ + 
##     Embarked.xS
## 
##               Df Deviance    AIC
## - Parch.x3     1   527.74 563.74
## - Parch.x2     1   527.78 563.78
## - SibSp.x1     1   527.98 563.98
## - Embarked.xC  1   528.14 564.14
## - Embarked.xQ  1   528.26 564.26
## - Embarked.xS  1   528.53 564.53
## - Parch.x6     1   529.09 565.09
## - Parch.x4     1   529.55 565.55
## - Parch.x1     1   529.56 565.56
## <none>             527.72 565.72
## - SibSp.x3     1   530.32 566.32
## - SibSp.x5     1   530.48 566.48
## - Parch.x5     1   530.64 566.64
## + SibSp.x2     1   527.69 567.69
## + Fare         1   527.71 567.71
## - SibSp.x4     1   532.05 568.05
## - SibSp.x8     1   532.52 568.52
## - Pclass.x2    1   534.89 570.89
## - Age          1   536.91 572.91
## - Pclass.x3    1   566.18 602.18
## - Sex          1   686.24 722.24
## 
## Step:  AIC=563.74
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x1 + SibSp.x3 + 
##     SibSp.x4 + SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x2 + Parch.x4 + 
##     Parch.x5 + Parch.x6 + Embarked.xC + Embarked.xQ + Embarked.xS
## 
##               Df Deviance    AIC
## - Parch.x2     1   527.82 561.82
## - SibSp.x1     1   528.03 562.03
## - Embarked.xC  1   528.17 562.17
## - Embarked.xQ  1   528.28 562.28
## - Embarked.xS  1   528.56 562.56
## - Parch.x6     1   529.13 563.13
## - Parch.x1     1   529.56 563.56
## - Parch.x4     1   529.59 563.59
## <none>             527.74 563.74
## - SibSp.x3     1   530.35 564.35
## - SibSp.x5     1   530.50 564.50
## - Parch.x5     1   530.69 564.69
## + Parch.x3     1   527.72 565.72
## + SibSp.x2     1   527.72 565.72
## + Fare         1   527.74 565.74
## - SibSp.x4     1   532.06 566.06
## - SibSp.x8     1   532.55 566.55
## - Pclass.x2    1   534.89 568.89
## - Age          1   536.91 570.91
## - Pclass.x3    1   566.31 600.31
## - Sex          1   688.25 722.25
## 
## Step:  AIC=561.82
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x1 + SibSp.x3 + 
##     SibSp.x4 + SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x4 + Parch.x5 + 
##     Parch.x6 + Embarked.xC + Embarked.xQ + Embarked.xS
## 
##               Df Deviance    AIC
## - SibSp.x1     1   528.09 560.09
## - Embarked.xC  1   528.25 560.25
## - Embarked.xQ  1   528.35 560.35
## - Embarked.xS  1   528.63 560.63
## - Parch.x6     1   529.19 561.19
## - Parch.x4     1   529.66 561.66
## <none>             527.82 561.82
## - Parch.x1     1   529.90 561.90
## - SibSp.x3     1   530.63 562.63
## - Parch.x5     1   530.74 562.74
## - SibSp.x5     1   530.89 562.89
## + Parch.x2     1   527.74 563.74
## + Parch.x3     1   527.78 563.78
## + SibSp.x2     1   527.79 563.79
## + Fare         1   527.80 563.80
## - SibSp.x4     1   532.96 564.96
## - SibSp.x8     1   533.40 565.40
## - Pclass.x2    1   534.92 566.92
## - Age          1   537.04 569.04
## - Pclass.x3    1   566.45 598.45
## - Sex          1   692.92 724.92
## 
## Step:  AIC=560.09
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x3 + SibSp.x4 + 
##     SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x4 + Parch.x5 + Parch.x6 + 
##     Embarked.xC + Embarked.xQ + Embarked.xS
## 
##               Df Deviance    AIC
## - Embarked.xC  1   528.50 558.50
## - Embarked.xQ  1   528.61 558.61
## - Embarked.xS  1   528.87 558.87
## - Parch.x6     1   529.38 559.38
## - Parch.x4     1   529.80 559.80
## <none>             528.09 560.09
## - Parch.x1     1   530.77 560.77
## - Parch.x5     1   530.94 560.94
## - SibSp.x3     1   531.15 561.15
## - SibSp.x5     1   531.23 561.23
## + SibSp.x1     1   527.82 561.82
## + Parch.x2     1   528.03 562.03
## + SibSp.x2     1   528.03 562.03
## + Parch.x3     1   528.04 562.04
## + Fare         1   528.08 562.08
## - SibSp.x4     1   533.56 563.56
## - SibSp.x8     1   533.82 563.82
## - Pclass.x2    1   535.25 565.25
## - Age          1   537.28 567.28
## - Pclass.x3    1   567.77 597.77
## - Sex          1   699.14 729.14
## 
## Step:  AIC=558.5
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x3 + SibSp.x4 + 
##     SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x4 + Parch.x5 + Parch.x6 + 
##     Embarked.xQ + Embarked.xS
## 
##               Df Deviance    AIC
## - Embarked.xQ  1   528.86 556.86
## - Parch.x6     1   529.80 557.80
## - Parch.x4     1   530.22 558.22
## <none>             528.50 558.50
## - Parch.x1     1   531.16 559.16
## - Parch.x5     1   531.36 559.36
## - SibSp.x3     1   531.56 559.56
## - SibSp.x5     1   531.63 559.63
## + Embarked.xC  1   528.09 560.09
## + SibSp.x1     1   528.25 560.25
## + Parch.x2     1   528.43 560.43
## + SibSp.x2     1   528.44 560.44
## + Parch.x3     1   528.45 560.45
## + Fare         1   528.49 560.49
## - SibSp.x4     1   533.93 561.93
## - SibSp.x8     1   534.24 562.24
## - Embarked.xS  1   534.47 562.47
## - Pclass.x2    1   535.72 563.72
## - Age          1   537.54 565.54
## - Pclass.x3    1   568.38 596.38
## - Sex          1   701.18 729.18
## 
## Step:  AIC=556.86
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x3 + SibSp.x4 + 
##     SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x4 + Parch.x5 + Parch.x6 + 
##     Embarked.xS
## 
##               Df Deviance    AIC
## - Parch.x6     1   530.13 556.13
## - Parch.x4     1   530.57 556.57
## <none>             528.86 556.86
## - Parch.x1     1   531.67 557.67
## - Parch.x5     1   531.67 557.67
## - SibSp.x3     1   531.89 557.89
## - SibSp.x5     1   531.95 557.95
## + Embarked.xQ  1   528.50 558.50
## + SibSp.x1     1   528.57 558.57
## + Embarked.xC  1   528.61 558.61
## + SibSp.x2     1   528.79 558.79
## + Parch.x3     1   528.80 558.80
## + Parch.x2     1   528.84 558.84
## + Fare         1   528.86 558.86
## - SibSp.x4     1   534.47 560.47
## - SibSp.x8     1   534.51 560.51
## - Embarked.xS  1   535.01 561.01
## - Pclass.x2    1   536.69 562.69
## - Age          1   538.09 564.09
## - Pclass.x3    1   574.06 600.06
## - Sex          1   701.50 727.50
## 
## Step:  AIC=556.13
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x3 + SibSp.x4 + 
##     SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x4 + Parch.x5 + Embarked.xS
## 
##               Df Deviance    AIC
## - Parch.x4     1   531.80 555.80
## <none>             530.13 556.13
## + Parch.x6     1   528.86 556.86
## - Parch.x5     1   532.88 556.88
## - Parch.x1     1   532.96 556.96
## - SibSp.x3     1   533.13 557.13
## - SibSp.x5     1   533.21 557.21
## + Embarked.xQ  1   529.80 557.80
## + Embarked.xC  1   529.90 557.90
## + SibSp.x1     1   529.93 557.93
## + Parch.x3     1   530.06 558.06
## + SibSp.x2     1   530.06 558.06
## + Parch.x2     1   530.11 558.11
## + Fare         1   530.12 558.12
## - SibSp.x8     1   535.70 559.70
## - SibSp.x4     1   535.73 559.73
## - Embarked.xS  1   536.49 560.49
## - Pclass.x2    1   538.00 562.00
## - Age          1   539.89 563.89
## - Pclass.x3    1   576.40 600.40
## - Sex          1   701.78 725.78
## 
## Step:  AIC=555.8
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x3 + SibSp.x4 + 
##     SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x5 + Embarked.xS
## 
##               Df Deviance    AIC
## <none>             531.80 555.80
## + Parch.x4     1   530.13 556.13
## - Parch.x5     1   534.46 556.46
## + Parch.x6     1   530.57 556.57
## - Parch.x1     1   534.65 556.65
## - SibSp.x3     1   534.77 556.77
## - SibSp.x5     1   534.87 556.87
## + Embarked.xQ  1   531.48 557.48
## + Embarked.xC  1   531.59 557.59
## + SibSp.x1     1   531.69 557.69
## + Parch.x3     1   531.71 557.71
## + SibSp.x2     1   531.73 557.73
## + Fare         1   531.74 557.74
## + Parch.x2     1   531.78 557.78
## - SibSp.x8     1   537.27 559.27
## - SibSp.x4     1   537.46 559.46
## - Embarked.xS  1   538.45 560.45
## - Pclass.x2    1   539.64 561.64
## - Age          1   542.57 564.57
## - Pclass.x3    1   579.16 601.16
## - Sex          1   703.10 725.10
summary(modelo_2)
## 
## Call:
## glm(formula = Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + 
##     SibSp.x3 + SibSp.x4 + SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x5 + 
##     Embarked.xS, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4969  -0.6026  -0.4110   0.5814   2.6251  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    2.8487     0.3319   8.584  < 2e-16 ***
## Age           -0.4168     0.1306  -3.191  0.00142 ** 
## Pclass.x2     -0.8834     0.3191  -2.768  0.00563 ** 
## Pclass.x3     -1.9410     0.2948  -6.585 4.55e-11 ***
## Sex           -2.7116     0.2303 -11.775  < 2e-16 ***
## SibSp.x3      -1.3979     0.8262  -1.692  0.09065 .  
## SibSp.x4      -1.8003     0.8478  -2.124  0.03370 *  
## SibSp.x5     -15.8263  1049.1566  -0.015  0.98796    
## SibSp.x8     -15.8633   813.1959  -0.020  0.98444    
## Parch.x1       0.5452     0.3238   1.684  0.09221 .  
## Parch.x5     -15.7997  1186.8256  -0.013  0.98938    
## Embarked.xS   -0.6528     0.2527  -2.583  0.00978 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 829.6  on 622  degrees of freedom
## Residual deviance: 531.8  on 611  degrees of freedom
## AIC: 555.8
## 
## Number of Fisher Scoring iterations: 15
modelo_3 <- glm(formula = Survived ~ Age+Pclass.x2+Pclass.x3+Sex +Embarked.xS, family = "binomial", data =train)
summary(modelo_3)
## 
## Call:
## glm(formula = Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + 
##     Embarked.xS, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4146  -0.6634  -0.3858   0.6350   2.4814  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.8842     0.3221   8.954  < 2e-16 ***
## Age          -0.3608     0.1201  -3.004  0.00267 ** 
## Pclass.x2    -0.7509     0.3101  -2.422  0.01544 *  
## Pclass.x3    -2.0760     0.2901  -7.157 8.28e-13 ***
## Sex          -2.6236     0.2223 -11.801  < 2e-16 ***
## Embarked.xS  -0.7829     0.2466  -3.174  0.00150 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 829.60  on 622  degrees of freedom
## Residual deviance: 552.27  on 617  degrees of freedom
## AIC: 564.27
## 
## Number of Fisher Scoring iterations: 5
vif(modelo_3)
##         Age   Pclass.x2   Pclass.x3         Sex Embarked.xS 
##    1.197488    1.632673    1.847381    1.061364    1.058444

Evaluación del modelo

pred <- predict(modelo_3, newdata = test)
summary(pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.1978 -2.2903 -1.1172 -0.7101  0.5858  3.1714
test$prob <- pred
names(test)
##  [1] "Age"         "Fare"        "Survived"    "Pclass.x2"   "Pclass.x3"  
##  [6] "Sex"         "SibSp.x1"    "SibSp.x2"    "SibSp.x3"    "SibSp.x4"   
## [11] "SibSp.x5"    "SibSp.x8"    "Parch.x1"    "Parch.x2"    "Parch.x3"   
## [16] "Parch.x4"    "Parch.x5"    "Parch.x6"    "Embarked.xC" "Embarked.xQ"
## [21] "Embarked.xS" "prob"
# Usando 50% como punto de corte

pred_survived <- factor(ifelse(pred >= 0.32, 1, 0))

actual_survived <- factor(ifelse(test$Survived==1,1,0))

table(actual_survived,pred_survived)
##                pred_survived
## actual_survived   0   1
##               0 148  17
##               1  43  60
cutoff_survived <- factor(ifelse(pred >=0.32, "1","0"))
conf_final <- confusionMatrix(cutoff_survived, actual_survived, positive = "1")
accuracy <- conf_final$overall[1]
sensitivity <- conf_final$byClass[1]
specificity <- conf_final$byClass[2]
accuracy
##  Accuracy 
## 0.7761194
sensitivity
## Sensitivity 
##   0.5825243
specificity
## Specificity 
##   0.8969697
perform_fn <- function(cutoff) 
{
  predicted_survived <- factor(ifelse(pred >= cutoff, 1, 0))
  conf <- confusionMatrix(predicted_survived, actual_survived, positive = "1")
  accuray <- conf$overall[1]
  sensitivity <- conf$byClass[1]
  specificity <- conf$byClass[2]
  out <- t(as.matrix(c(sensitivity, specificity, accuray))) 
  colnames(out) <- c("sensitivity", "specificity", "accuracy")
  return(out)
}
options(repr.plot.width =8, repr.plot.height =6)
summary(pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.1978 -2.2903 -1.1172 -0.7101  0.5858  3.1714
s = seq(0.01,0.80,length=100)
OUT = matrix(0,100,3)

for(i in 1:100)
{
  OUT[i,] = perform_fn(s[i])
} 

plot(s, OUT[,1],xlab="Cutoff",ylab="Valor",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),
     type="l",lwd=2,axes=FALSE,col=2)
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col="darkgreen",lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend("bottom",col=c(2,"darkgreen",4,"darkred"),text.font =3,inset = 0.02,
       box.lty=0,cex = 0.8, 
       lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy"))
abline(v = 0.32, col="red", lwd=1, lty=2)
axis(1, at = seq(0.1, 1, by = 0.1))

glm.roc <- roc(response = test$Survived, predictor = as.numeric(pred))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(glm.roc,      legacy.axes = TRUE, print.auc.y = 1.0, print.auc = TRUE)