# install.packages("reticulate")
library(reticulate)
use_python("C:/Users/Cesar Corregidor/anaconda3/python.exe")INSTRUCCIONES
Determine si es de su interéss someter sus eventuales resultados a la plataforma Kaggle, en el desafío Titanic.
Si le resulta interesante la idea de someter sus resultados a Kaggle, utilice los datos dados aquí En este caso, el entregable constará únicamente del comprobante de la sumisión junto con los resultados dados por la plataforma.
Caso contrario, utilice los datos disponibles aquí Aquí, el entregable constará de un pdf Jupyter notebook con la presentación de su solución a los siguientes ítems
Plazo Máximo de entrega: Domingo 5 de Diciembre de 2021 (23h59).
data_r <- read.csv("D:/Cesar Corregidor/Documents/especializacion/ANALISIS_DE_REGRESION/Regresion-Aplicada-main/Talleres/Taller3/train.csv", sep = ",")
data <- read.csv("D:/Cesar Corregidor/Documents/especializacion/ANALISIS_DE_REGRESION/Regresion-Aplicada-main/Talleres/Taller3/train.csv", sep = ",")
library(DT)
DT::datatable(data)str(data)## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
dim(data[1])## [1] 891 1
hist(data_r$Age)data_py = r_to_py(data_r)
data_py$head()## PassengerId Survived Pclass ... Fare Cabin Embarked
## 0 1 0 3 ... 7.2500 S
## 1 2 1 1 ... 71.2833 C85 C
## 2 3 1 3 ... 7.9250 S
## 3 4 1 1 ... 53.1000 C123 S
## 4 5 0 3 ... 8.0500 S
##
## [5 rows x 12 columns]
data_py$isnull()$sum()## PassengerId 0
## Survived 0
## Pclass 0
## Name 0
## Sex 0
## Age 177
## SibSp 0
## Parch 0
## Ticket 0
## Fare 0
## Cabin 0
## Embarked 0
## dtype: int64
data_r = py_to_r(data_py)library(tidyr)
data_r = data_r %>% drop_na()
dim(data_r)[1]## [1] 714
mediana = median(data_r$Age)
mediana## [1] 28
data[is.na(data)] <- mediana
hist(data$Age)Considere la base de datos Titanic seleccionada. A partir de la misma resuelva:
- Realice una corta presentación del conjunto de datos asegurándose de entender cada variable.
| Variable | descripción |
|---|---|
| passengerId | Id del pasajero |
| survival: | Supervivencia. 0 = No, 1 = Si |
| pclass: | Clase del tiquete 1 = 1st, 2 = 2nd, 3 = 3rd |
| name: | Nombre del pasajero |
| sex: | Género |
| Age: | Edad |
| sibsp: | Números de hermanos / cónyuges a bordo del Titanic |
| parch: | Número de padres / Niños a bordo del Titanic |
| ticket: | Número de ticket |
| fare: | Tarifa de pasajero |
| cabin: | Número de la cabina |
| embarked: | Puerto de embarque C = Cherbourg, Q = Queenstown, S = Southampton |
- Presente los principales resultados de un Análisis Descriptivo y Exploratorio de los datos. Haga énfasis en la relación que pueda existir entre las variables.
library(tidyverse)
library(MASS)
library(car)
library(e1071)
library(caret)
library(cowplot)
library(caTools)
library(pROC)
library(ggcorrplot)## Warning: package 'ggcorrplot' was built under R version 4.1.2
glimpse(data)## Rows: 891
## Columns: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
## $ Survived <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1~
## $ Pclass <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3~
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl~
## $ Sex <chr> "male", "female", "female", "female", "male", "male", "mal~
## $ Age <dbl> 22, 38, 26, 35, 35, 28, 54, 2, 27, 14, 4, 58, 20, 39, 14, ~
## $ SibSp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0~
## $ Parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0~
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37~
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,~
## $ Cabin <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C~
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"~
Las variables Survived, Pclass, Sex, SibSp, Parch y Embarked las convertimos en variables categóricas
data$Survived=factor(data$Survived)
data$Sex=factor(data$Sex)
data$Pclass=factor(data$Pclass)
data$SibSp=factor(data$SibSp)
data$Parch=factor(data$Parch)
data$Embarked=factor(data$Embarked)
data$Ticket=factor(data$Ticket)
data$Cabin=factor(data$Cabin)
#data %>% filter(Embarked=="")theme1 <- theme_bw()+
theme(axis.text.x = element_text(angle = 0, hjust = 1, vjust = 0.5),legend.position="none")
theme2 <- theme_bw()+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),legend.position="none")options(repr.plot.width = 8, repr.plot.height = 5)
plot_grid(ggplot(data, aes(x=Pclass,fill=Survived))+ geom_bar()+theme1,
ggplot(data, aes(x=Sex,fill=Survived))+ geom_bar()+theme1,
ggplot(data, aes(x=SibSp,fill=Survived))+ geom_bar()+theme1,
ggplot(data, aes(x=Parch,fill=Survived))+ geom_bar()+theme1,
ggplot(data, aes(x=Embarked,fill=Survived))+ geom_bar()+theme1+
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)), align = "h") - Pclass: Existe una proporción homogénea para el de tipo 2; para el ticket de clase 1 se salvaron más de la mitad de esa población en contraste con los de clase 3 donde la proporción de los que no se salvaron es mayor a los que si lo hicieron.
Sex: Cerca del 80% de la población femenina sobrevivió. A diferencia del género másculino donde la proporción de supervivencia es menor. Es importante tener en cuenta que habían más hombres que mujeres.
En SibSp: Se observa que a mayor hermanos o cónyugues(familiares) la tasa de mortalidad aumenta.
En Parch: Aquellas personas que no tenían padres o hijos a bordo la proporción de supervivencia es inferior.
Embarked: Las personas que embarcaron en Queenstown y Southampton tienen una proporción de supervivencia semejante donde los que no sobrevivieron es más del 50%. En cambio para el puerto Cherbourg la proporción de supervencia es mayor.
Realicemos el análisis para las variables cuantitativas
options(repr.plot.width =10, repr.plot.height = 5)
ggplot(data, aes(y= Fare, x = "", fill = Survived)) +
geom_boxplot()+
theme_bw()+
xlab(" ") Según el boxplot para las personas que no sobrevivieron si afectó bastante la tarifa del pasajero puesto que la mediana está muy cerca del percentil 25 indicando que hay una fuerte concentración de datos en este intervalo. Si bien el comportamiento para las personas que si sobrevivieron tiende a ser el mismo la distribución es un poco más homogenea. También podemos decir que existieron personas que no sobrevivieron a pesar de que la tarija de pasajero era alta
options(repr.plot.width =10, repr.plot.height = 5)
ggplot(data, aes(y= Age, x = "", fill = Survived)) +
geom_boxplot()+
theme_bw()+
xlab(" ") Podemos decir que hubo personas de avanzada edad y niños que no sobrevivieron. La mediana para las personas que se salvaron y las que no son cercanas y la media tiende a ser la mediana.
options(repr.plot.width =6, repr.plot.height = 4)
data_corr <- round(cor(data[,c("Age", "Fare")]), 1)
ggcorrplot(data_corr, title = "Correlation")+theme(plot.title = element_text(hjust = 0.5))No existe correlación entre las variables cuantitativas(Fare y Age)
num_columns <- c("Age", "Fare")
data[num_columns] <- sapply(data[num_columns], as.numeric)
data_int <- data[,c("Age", "Fare")]
data_int <- data.frame(scale(data_int))data_cat <- data[,-c(1,4,6,9,10,11)]
names(data)## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
dummy<- data.frame(sapply(data_cat,function(x) data.frame(model.matrix(~x-1,data =data_cat))[,-1]))
head(dummy)## Survived Pclass.x2 Pclass.x3 Sex SibSp.x1 SibSp.x2 SibSp.x3 SibSp.x4 SibSp.x5
## 1 0 0 1 1 1 0 0 0 0
## 2 1 0 0 0 1 0 0 0 0
## 3 1 0 1 0 0 0 0 0 0
## 4 1 0 0 0 1 0 0 0 0
## 5 0 0 1 1 0 0 0 0 0
## 6 0 0 1 1 0 0 0 0 0
## SibSp.x8 Parch.x1 Parch.x2 Parch.x3 Parch.x4 Parch.x5 Parch.x6 Embarked.xC
## 1 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 1
## 3 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0
## Embarked.xQ Embarked.xS
## 1 0 1
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 1
## 6 1 0
data_final <- cbind(data_int,dummy)
head(data_final)## Age Fare Survived Pclass.x2 Pclass.x3 Sex SibSp.x1 SibSp.x2
## 1 -0.5654189 -0.5021631 0 0 1 1 1 0
## 2 0.6634884 0.7864036 1 0 0 0 1 0
## 3 -0.2581921 -0.4885799 1 0 1 0 0 0
## 4 0.4330683 0.4204941 1 0 0 0 1 0
## 5 0.4330683 -0.4860644 0 0 1 1 0 0
## 6 -0.1045787 -0.4778481 0 0 1 1 0 0
## SibSp.x3 SibSp.x4 SibSp.x5 SibSp.x8 Parch.x1 Parch.x2 Parch.x3 Parch.x4
## 1 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0
## Parch.x5 Parch.x6 Embarked.xC Embarked.xQ Embarked.xS
## 1 0 0 0 0 1
## 2 0 0 1 0 0
## 3 0 0 0 0 1
## 4 0 0 0 0 1
## 5 0 0 0 0 1
## 6 0 0 0 1 0
set.seed(123)
indices = sample.split(data_final$Survived, SplitRatio = 0.7)
train = data_final[indices,]
test = data_final[!(indices),]modelo saturado
modelo_1 = glm(Survived ~ ., data = train, family = "binomial")
summary(modelo_1)##
## Call:
## glm(formula = Survived ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4785 -0.5893 -0.4082 0.5815 2.6225
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 18.25576 2759.33446 0.007 0.99472
## Age -0.40331 0.13639 -2.957 0.00311 **
## Fare -0.01066 0.14598 -0.073 0.94178
## Pclass.x2 -0.86875 0.35991 -2.414 0.01579 *
## Pclass.x3 -1.86853 0.36357 -5.139 2.76e-07 ***
## Sex -2.72832 0.24210 -11.269 < 2e-16 ***
## SibSp.x1 0.13173 0.27151 0.485 0.62756
## SibSp.x2 -0.09308 0.60630 -0.154 0.87799
## SibSp.x3 -1.33359 0.84845 -1.572 0.11600
## SibSp.x4 -1.69611 0.89442 -1.896 0.05792 .
## SibSp.x5 -16.74814 1727.51516 -0.010 0.99226
## SibSp.x8 -16.79883 1340.16276 -0.013 0.99000
## Parch.x1 0.47579 0.34906 1.363 0.17285
## Parch.x2 -0.09819 0.43401 -0.226 0.82101
## Parch.x3 0.19690 1.09001 0.181 0.85665
## Parch.x4 -16.42203 2010.04393 -0.008 0.99348
## Parch.x5 -16.93578 1965.65382 -0.009 0.99313
## Parch.x6 -17.57204 3956.18035 -0.004 0.99646
## Embarked.xC -15.37401 2759.33447 -0.006 0.99555
## Embarked.xQ -15.64313 2759.33449 -0.006 0.99548
## Embarked.xS -16.08735 2759.33447 -0.006 0.99535
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 829.60 on 622 degrees of freedom
## Residual deviance: 527.68 on 602 degrees of freedom
## AIC: 569.68
##
## Number of Fisher Scoring iterations: 16
modelo_2 <- stepAIC(modelo_1, direction="both")## Start: AIC=569.68
## Survived ~ Age + Fare + Pclass.x2 + Pclass.x3 + Sex + SibSp.x1 +
## SibSp.x2 + SibSp.x3 + SibSp.x4 + SibSp.x5 + SibSp.x8 + Parch.x1 +
## Parch.x2 + Parch.x3 + Parch.x4 + Parch.x5 + Parch.x6 + Embarked.xC +
## Embarked.xQ + Embarked.xS
##
## Df Deviance AIC
## - Fare 1 527.69 567.69
## - SibSp.x2 1 527.71 567.71
## - Parch.x3 1 527.72 567.72
## - Parch.x2 1 527.74 567.74
## - SibSp.x1 1 527.92 567.92
## - Embarked.xC 1 528.11 568.11
## - Embarked.xQ 1 528.22 568.22
## - Embarked.xS 1 528.50 568.50
## - Parch.x6 1 529.04 569.04
## - Parch.x4 1 529.47 569.47
## - Parch.x1 1 529.55 569.55
## <none> 527.68 569.68
## - SibSp.x3 1 530.23 570.23
## - SibSp.x5 1 530.46 570.46
## - Parch.x5 1 530.58 570.58
## - SibSp.x4 1 532.04 572.04
## - SibSp.x8 1 532.45 572.45
## - Pclass.x2 1 533.54 573.54
## - Age 1 536.89 576.89
## - Pclass.x3 1 553.98 593.98
## - Sex 1 685.96 725.96
##
## Step: AIC=567.69
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x1 + SibSp.x2 +
## SibSp.x3 + SibSp.x4 + SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x2 +
## Parch.x3 + Parch.x4 + Parch.x5 + Parch.x6 + Embarked.xC +
## Embarked.xQ + Embarked.xS
##
## Df Deviance AIC
## - SibSp.x2 1 527.72 565.72
## - Parch.x3 1 527.72 565.72
## - Parch.x2 1 527.75 565.75
## - SibSp.x1 1 527.92 565.92
## - Embarked.xC 1 528.12 566.12
## - Embarked.xQ 1 528.23 566.23
## - Embarked.xS 1 528.50 566.50
## - Parch.x6 1 529.06 567.06
## - Parch.x4 1 529.51 567.51
## - Parch.x1 1 529.56 567.56
## <none> 527.69 567.69
## - SibSp.x3 1 530.32 568.32
## - SibSp.x5 1 530.47 568.47
## - Parch.x5 1 530.61 568.61
## + Fare 1 527.68 569.68
## - SibSp.x4 1 532.05 570.05
## - SibSp.x8 1 532.52 570.52
## - Pclass.x2 1 534.84 572.84
## - Age 1 536.91 574.91
## - Pclass.x3 1 566.17 604.17
## - Sex 1 686.14 724.14
##
## Step: AIC=565.72
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x1 + SibSp.x3 +
## SibSp.x4 + SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x2 + Parch.x3 +
## Parch.x4 + Parch.x5 + Parch.x6 + Embarked.xC + Embarked.xQ +
## Embarked.xS
##
## Df Deviance AIC
## - Parch.x3 1 527.74 563.74
## - Parch.x2 1 527.78 563.78
## - SibSp.x1 1 527.98 563.98
## - Embarked.xC 1 528.14 564.14
## - Embarked.xQ 1 528.26 564.26
## - Embarked.xS 1 528.53 564.53
## - Parch.x6 1 529.09 565.09
## - Parch.x4 1 529.55 565.55
## - Parch.x1 1 529.56 565.56
## <none> 527.72 565.72
## - SibSp.x3 1 530.32 566.32
## - SibSp.x5 1 530.48 566.48
## - Parch.x5 1 530.64 566.64
## + SibSp.x2 1 527.69 567.69
## + Fare 1 527.71 567.71
## - SibSp.x4 1 532.05 568.05
## - SibSp.x8 1 532.52 568.52
## - Pclass.x2 1 534.89 570.89
## - Age 1 536.91 572.91
## - Pclass.x3 1 566.18 602.18
## - Sex 1 686.24 722.24
##
## Step: AIC=563.74
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x1 + SibSp.x3 +
## SibSp.x4 + SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x2 + Parch.x4 +
## Parch.x5 + Parch.x6 + Embarked.xC + Embarked.xQ + Embarked.xS
##
## Df Deviance AIC
## - Parch.x2 1 527.82 561.82
## - SibSp.x1 1 528.03 562.03
## - Embarked.xC 1 528.17 562.17
## - Embarked.xQ 1 528.28 562.28
## - Embarked.xS 1 528.56 562.56
## - Parch.x6 1 529.13 563.13
## - Parch.x1 1 529.56 563.56
## - Parch.x4 1 529.59 563.59
## <none> 527.74 563.74
## - SibSp.x3 1 530.35 564.35
## - SibSp.x5 1 530.50 564.50
## - Parch.x5 1 530.69 564.69
## + Parch.x3 1 527.72 565.72
## + SibSp.x2 1 527.72 565.72
## + Fare 1 527.74 565.74
## - SibSp.x4 1 532.06 566.06
## - SibSp.x8 1 532.55 566.55
## - Pclass.x2 1 534.89 568.89
## - Age 1 536.91 570.91
## - Pclass.x3 1 566.31 600.31
## - Sex 1 688.25 722.25
##
## Step: AIC=561.82
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x1 + SibSp.x3 +
## SibSp.x4 + SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x4 + Parch.x5 +
## Parch.x6 + Embarked.xC + Embarked.xQ + Embarked.xS
##
## Df Deviance AIC
## - SibSp.x1 1 528.09 560.09
## - Embarked.xC 1 528.25 560.25
## - Embarked.xQ 1 528.35 560.35
## - Embarked.xS 1 528.63 560.63
## - Parch.x6 1 529.19 561.19
## - Parch.x4 1 529.66 561.66
## <none> 527.82 561.82
## - Parch.x1 1 529.90 561.90
## - SibSp.x3 1 530.63 562.63
## - Parch.x5 1 530.74 562.74
## - SibSp.x5 1 530.89 562.89
## + Parch.x2 1 527.74 563.74
## + Parch.x3 1 527.78 563.78
## + SibSp.x2 1 527.79 563.79
## + Fare 1 527.80 563.80
## - SibSp.x4 1 532.96 564.96
## - SibSp.x8 1 533.40 565.40
## - Pclass.x2 1 534.92 566.92
## - Age 1 537.04 569.04
## - Pclass.x3 1 566.45 598.45
## - Sex 1 692.92 724.92
##
## Step: AIC=560.09
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x3 + SibSp.x4 +
## SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x4 + Parch.x5 + Parch.x6 +
## Embarked.xC + Embarked.xQ + Embarked.xS
##
## Df Deviance AIC
## - Embarked.xC 1 528.50 558.50
## - Embarked.xQ 1 528.61 558.61
## - Embarked.xS 1 528.87 558.87
## - Parch.x6 1 529.38 559.38
## - Parch.x4 1 529.80 559.80
## <none> 528.09 560.09
## - Parch.x1 1 530.77 560.77
## - Parch.x5 1 530.94 560.94
## - SibSp.x3 1 531.15 561.15
## - SibSp.x5 1 531.23 561.23
## + SibSp.x1 1 527.82 561.82
## + Parch.x2 1 528.03 562.03
## + SibSp.x2 1 528.03 562.03
## + Parch.x3 1 528.04 562.04
## + Fare 1 528.08 562.08
## - SibSp.x4 1 533.56 563.56
## - SibSp.x8 1 533.82 563.82
## - Pclass.x2 1 535.25 565.25
## - Age 1 537.28 567.28
## - Pclass.x3 1 567.77 597.77
## - Sex 1 699.14 729.14
##
## Step: AIC=558.5
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x3 + SibSp.x4 +
## SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x4 + Parch.x5 + Parch.x6 +
## Embarked.xQ + Embarked.xS
##
## Df Deviance AIC
## - Embarked.xQ 1 528.86 556.86
## - Parch.x6 1 529.80 557.80
## - Parch.x4 1 530.22 558.22
## <none> 528.50 558.50
## - Parch.x1 1 531.16 559.16
## - Parch.x5 1 531.36 559.36
## - SibSp.x3 1 531.56 559.56
## - SibSp.x5 1 531.63 559.63
## + Embarked.xC 1 528.09 560.09
## + SibSp.x1 1 528.25 560.25
## + Parch.x2 1 528.43 560.43
## + SibSp.x2 1 528.44 560.44
## + Parch.x3 1 528.45 560.45
## + Fare 1 528.49 560.49
## - SibSp.x4 1 533.93 561.93
## - SibSp.x8 1 534.24 562.24
## - Embarked.xS 1 534.47 562.47
## - Pclass.x2 1 535.72 563.72
## - Age 1 537.54 565.54
## - Pclass.x3 1 568.38 596.38
## - Sex 1 701.18 729.18
##
## Step: AIC=556.86
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x3 + SibSp.x4 +
## SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x4 + Parch.x5 + Parch.x6 +
## Embarked.xS
##
## Df Deviance AIC
## - Parch.x6 1 530.13 556.13
## - Parch.x4 1 530.57 556.57
## <none> 528.86 556.86
## - Parch.x1 1 531.67 557.67
## - Parch.x5 1 531.67 557.67
## - SibSp.x3 1 531.89 557.89
## - SibSp.x5 1 531.95 557.95
## + Embarked.xQ 1 528.50 558.50
## + SibSp.x1 1 528.57 558.57
## + Embarked.xC 1 528.61 558.61
## + SibSp.x2 1 528.79 558.79
## + Parch.x3 1 528.80 558.80
## + Parch.x2 1 528.84 558.84
## + Fare 1 528.86 558.86
## - SibSp.x4 1 534.47 560.47
## - SibSp.x8 1 534.51 560.51
## - Embarked.xS 1 535.01 561.01
## - Pclass.x2 1 536.69 562.69
## - Age 1 538.09 564.09
## - Pclass.x3 1 574.06 600.06
## - Sex 1 701.50 727.50
##
## Step: AIC=556.13
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x3 + SibSp.x4 +
## SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x4 + Parch.x5 + Embarked.xS
##
## Df Deviance AIC
## - Parch.x4 1 531.80 555.80
## <none> 530.13 556.13
## + Parch.x6 1 528.86 556.86
## - Parch.x5 1 532.88 556.88
## - Parch.x1 1 532.96 556.96
## - SibSp.x3 1 533.13 557.13
## - SibSp.x5 1 533.21 557.21
## + Embarked.xQ 1 529.80 557.80
## + Embarked.xC 1 529.90 557.90
## + SibSp.x1 1 529.93 557.93
## + Parch.x3 1 530.06 558.06
## + SibSp.x2 1 530.06 558.06
## + Parch.x2 1 530.11 558.11
## + Fare 1 530.12 558.12
## - SibSp.x8 1 535.70 559.70
## - SibSp.x4 1 535.73 559.73
## - Embarked.xS 1 536.49 560.49
## - Pclass.x2 1 538.00 562.00
## - Age 1 539.89 563.89
## - Pclass.x3 1 576.40 600.40
## - Sex 1 701.78 725.78
##
## Step: AIC=555.8
## Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex + SibSp.x3 + SibSp.x4 +
## SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x5 + Embarked.xS
##
## Df Deviance AIC
## <none> 531.80 555.80
## + Parch.x4 1 530.13 556.13
## - Parch.x5 1 534.46 556.46
## + Parch.x6 1 530.57 556.57
## - Parch.x1 1 534.65 556.65
## - SibSp.x3 1 534.77 556.77
## - SibSp.x5 1 534.87 556.87
## + Embarked.xQ 1 531.48 557.48
## + Embarked.xC 1 531.59 557.59
## + SibSp.x1 1 531.69 557.69
## + Parch.x3 1 531.71 557.71
## + SibSp.x2 1 531.73 557.73
## + Fare 1 531.74 557.74
## + Parch.x2 1 531.78 557.78
## - SibSp.x8 1 537.27 559.27
## - SibSp.x4 1 537.46 559.46
## - Embarked.xS 1 538.45 560.45
## - Pclass.x2 1 539.64 561.64
## - Age 1 542.57 564.57
## - Pclass.x3 1 579.16 601.16
## - Sex 1 703.10 725.10
summary(modelo_2)##
## Call:
## glm(formula = Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex +
## SibSp.x3 + SibSp.x4 + SibSp.x5 + SibSp.x8 + Parch.x1 + Parch.x5 +
## Embarked.xS, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4969 -0.6026 -0.4110 0.5814 2.6251
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.8487 0.3319 8.584 < 2e-16 ***
## Age -0.4168 0.1306 -3.191 0.00142 **
## Pclass.x2 -0.8834 0.3191 -2.768 0.00563 **
## Pclass.x3 -1.9410 0.2948 -6.585 4.55e-11 ***
## Sex -2.7116 0.2303 -11.775 < 2e-16 ***
## SibSp.x3 -1.3979 0.8262 -1.692 0.09065 .
## SibSp.x4 -1.8003 0.8478 -2.124 0.03370 *
## SibSp.x5 -15.8263 1049.1566 -0.015 0.98796
## SibSp.x8 -15.8633 813.1959 -0.020 0.98444
## Parch.x1 0.5452 0.3238 1.684 0.09221 .
## Parch.x5 -15.7997 1186.8256 -0.013 0.98938
## Embarked.xS -0.6528 0.2527 -2.583 0.00978 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 829.6 on 622 degrees of freedom
## Residual deviance: 531.8 on 611 degrees of freedom
## AIC: 555.8
##
## Number of Fisher Scoring iterations: 15
modelo_3 <- glm(formula = Survived ~ Age+Pclass.x2+Pclass.x3+Sex +Embarked.xS, family = "binomial", data =train)summary(modelo_3)##
## Call:
## glm(formula = Survived ~ Age + Pclass.x2 + Pclass.x3 + Sex +
## Embarked.xS, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4146 -0.6634 -0.3858 0.6350 2.4814
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.8842 0.3221 8.954 < 2e-16 ***
## Age -0.3608 0.1201 -3.004 0.00267 **
## Pclass.x2 -0.7509 0.3101 -2.422 0.01544 *
## Pclass.x3 -2.0760 0.2901 -7.157 8.28e-13 ***
## Sex -2.6236 0.2223 -11.801 < 2e-16 ***
## Embarked.xS -0.7829 0.2466 -3.174 0.00150 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 829.60 on 622 degrees of freedom
## Residual deviance: 552.27 on 617 degrees of freedom
## AIC: 564.27
##
## Number of Fisher Scoring iterations: 5
vif(modelo_3)## Age Pclass.x2 Pclass.x3 Sex Embarked.xS
## 1.197488 1.632673 1.847381 1.061364 1.058444
Evaluación del modelo
pred <- predict(modelo_3, newdata = test)
summary(pred)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1978 -2.2903 -1.1172 -0.7101 0.5858 3.1714
test$prob <- pred
names(test)## [1] "Age" "Fare" "Survived" "Pclass.x2" "Pclass.x3"
## [6] "Sex" "SibSp.x1" "SibSp.x2" "SibSp.x3" "SibSp.x4"
## [11] "SibSp.x5" "SibSp.x8" "Parch.x1" "Parch.x2" "Parch.x3"
## [16] "Parch.x4" "Parch.x5" "Parch.x6" "Embarked.xC" "Embarked.xQ"
## [21] "Embarked.xS" "prob"
# Usando 50% como punto de corte
pred_survived <- factor(ifelse(pred >= 0.32, 1, 0))
actual_survived <- factor(ifelse(test$Survived==1,1,0))
table(actual_survived,pred_survived)## pred_survived
## actual_survived 0 1
## 0 148 17
## 1 43 60
cutoff_survived <- factor(ifelse(pred >=0.32, "1","0"))
conf_final <- confusionMatrix(cutoff_survived, actual_survived, positive = "1")
accuracy <- conf_final$overall[1]
sensitivity <- conf_final$byClass[1]
specificity <- conf_final$byClass[2]
accuracy## Accuracy
## 0.7761194
sensitivity## Sensitivity
## 0.5825243
specificity## Specificity
## 0.8969697
perform_fn <- function(cutoff)
{
predicted_survived <- factor(ifelse(pred >= cutoff, 1, 0))
conf <- confusionMatrix(predicted_survived, actual_survived, positive = "1")
accuray <- conf$overall[1]
sensitivity <- conf$byClass[1]
specificity <- conf$byClass[2]
out <- t(as.matrix(c(sensitivity, specificity, accuray)))
colnames(out) <- c("sensitivity", "specificity", "accuracy")
return(out)
}options(repr.plot.width =8, repr.plot.height =6)
summary(pred)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1978 -2.2903 -1.1172 -0.7101 0.5858 3.1714
s = seq(0.01,0.80,length=100)
OUT = matrix(0,100,3)
for(i in 1:100)
{
OUT[i,] = perform_fn(s[i])
}
plot(s, OUT[,1],xlab="Cutoff",ylab="Valor",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),
type="l",lwd=2,axes=FALSE,col=2)
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col="darkgreen",lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend("bottom",col=c(2,"darkgreen",4,"darkred"),text.font =3,inset = 0.02,
box.lty=0,cex = 0.8,
lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy"))
abline(v = 0.32, col="red", lwd=1, lty=2)
axis(1, at = seq(0.1, 1, by = 0.1))glm.roc <- roc(response = test$Survived, predictor = as.numeric(pred))## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(glm.roc, legacy.axes = TRUE, print.auc.y = 1.0, print.auc = TRUE)