Brayan Iván Cruz Corona

13001595

Defino el directorio donde trabajare:

setwd("~/Documents/Econometria/Lab3")

Cargamos las librerias necesarias:

library(readr)
library(dplyr)
library(tidyr)

Cargamos las bases de datos de Train y Test:

trainDB <- read_csv("train.csv")
Parsed with column specification:
cols(
  PassengerId = col_integer(),
  Survived = col_integer(),
  Pclass = col_integer(),
  Name = col_character(),
  Sex = col_character(),
  Age = col_double(),
  SibSp = col_integer(),
  Parch = col_integer(),
  Ticket = col_character(),
  Fare = col_double(),
  Cabin = col_character(),
  Embarked = col_character()
)
testDB <- read_csv("test.csv")
Parsed with column specification:
cols(
  PassengerId = col_integer(),
  Pclass = col_integer(),
  Name = col_character(),
  Sex = col_character(),
  Age = col_double(),
  SibSp = col_integer(),
  Parch = col_integer(),
  Ticket = col_character(),
  Fare = col_double(),
  Cabin = col_character(),
  Embarked = col_character()
)
glimpse(trainDB)
Observations: 891
Variables: 12
$ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 2...
$ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,...
$ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3, 2, 2, 3,...
$ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Florence Bri...
$ Sex         <chr> "male", "female", "female", "female", "male", "male", "male", "male"...
$ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, 55, 2, NA,...
$ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0, 0, 0, 0,...
$ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
$ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "373450", "33...
$ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625, 21.0750, ...
$ Cabin       <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", "C103", NA, ...
$ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S", "S", "S"...

Modelo Inicial

Hombres que NO sobrevivieron y Mujeres que sí sobrevivieron

Se incluyen en en analisis solamente las columnas de “Sex”, “Survived” Se debe recordar que en la columna de “Survived” un 1 significa “Sobrevivió” y 0 “No sobrevivió”

trainDB %>%
  group_by(Sex,Survived) %>%
  summarise(n=n()) %>%
  left_join(train %>%
    group_by(Sex) %>%
    summarise(total_pasajeros_sexo=n())) %>%
  ungroup() %>%
  mutate(resultados_finales = round(n/total_pasajeros_sexo,2))
Joining, by = "Sex"

Modelo de regresion lineal con corte de 0.5

fit_reg <- lm(Survived ~ Pclass+Sex+Age, data = trainDB)
summary(fit_reg)

Call:
lm(formula = Survived ~ Pclass + Sex + Age, data = trainDB)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.11224 -0.25417 -0.06352  0.22700  1.00737 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.326066   0.062606  21.181  < 2e-16 ***
Pclass      -0.202916   0.018891 -10.741  < 2e-16 ***
Sexmale     -0.479293   0.030671 -15.627  < 2e-16 ***
Age         -0.005453   0.001082  -5.042 5.86e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3846 on 710 degrees of freedom
  (177 observations deleted due to missingness)
Multiple R-squared:  0.3902,    Adjusted R-squared:  0.3876 
F-statistic: 151.4 on 3 and 710 DF,  p-value: < 2.2e-16

Promedio de edad

mean(test$Age,na.rm=TRUE)

Mediana de edad

median(test$Age,na.rm=TRUE)
testDB$Age <- ifelse(is.na(test$Age),27,test$Age)
pred_reg <- predict(fit_reg,testDB)
mod2 <- cbind(testDB,Survived = pred_reg) %>% 
  select(PassengerId,Survived) %>% 
  mutate (Survived = if_else(Survived>=0.5,1,0))
mod2 %>% write_csv("modelo2_corte05.csv")

Exactitud aproximada de: 0.75

Modelo de regresion logistica con corte de 0.5

fit_logistica <- glm(Survived ~ Pclass+Sex+Age, data = trainDB, family = "binomial")
summary(fit_logistica)

Call:
glm(formula = Survived ~ Pclass + Sex + Age, family = "binomial", 
    data = trainDB)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7270  -0.6799  -0.3947   0.6483   2.4668  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  5.056006   0.502128  10.069  < 2e-16 ***
Pclass      -1.288545   0.139259  -9.253  < 2e-16 ***
Sexmale     -2.522131   0.207283 -12.168  < 2e-16 ***
Age         -0.036929   0.007628  -4.841 1.29e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 964.52  on 713  degrees of freedom
Residual deviance: 647.29  on 710  degrees of freedom
  (177 observations deleted due to missingness)
AIC: 655.29

Number of Fisher Scoring iterations: 5
pred_logistica <- predict(fit_logistica,testDB,type="response")
Sobrevivientes <- ifelse(pred_logistica>0.5,1,0)
modelo_reg_log05 <- cbind(test,Sobrevivientes) %>% 
  dplyr::select(PassengerId,Sobrevivientes)
modelo_reg_log05 %>% 
  write_csv("modelo_reg_log05.csv")

Exactitud aproximada de: 0.74

Entonces podemos ver que la regresion lineal con corte de 0.5 es una de las mejores opciones ya que tiene una exactitud de 0.75.

LS0tCnRpdGxlOiAiTGFib3JhdG9yaW8gMyAtIFRpdGFuaWMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMjI0JyYXlhbiBJdsOhbiBDcnV6IENvcm9uYQojIyMxMzAwMTU5NQoKRGVmaW5vIGVsIGRpcmVjdG9yaW8gZG9uZGUgdHJhYmFqYXJlOgpgYGB7cn0Kc2V0d2QoIn4vRG9jdW1lbnRzL0Vjb25vbWV0cmlhL0xhYjMiKQpgYGAKCgpDYXJnYW1vcyBsYXMgbGlicmVyaWFzIG5lY2VzYXJpYXM6CgpgYGB7cn0KbGlicmFyeShyZWFkcikKbGlicmFyeShkcGx5cikKbGlicmFyeSh0aWR5cikKYGBgCgoKQ2FyZ2Ftb3MgbGFzIGJhc2VzIGRlIGRhdG9zIGRlIFRyYWluIHkgVGVzdDoKCmBgYHtyfQp0cmFpbkRCIDwtIHJlYWRfY3N2KCJ0cmFpbi5jc3YiKQp0ZXN0REIgPC0gcmVhZF9jc3YoInRlc3QuY3N2IikKYGBgCgpgYGB7cn0KZ2xpbXBzZSh0cmFpbkRCKQpgYGAKCiMjI01vZGVsbyBJbmljaWFsCgpIb21icmVzIHF1ZSBOTyBzb2JyZXZpdmllcm9uIHkgTXVqZXJlcyBxdWUgc8OtIHNvYnJldml2aWVyb24KClNlIGluY2x1eWVuIGVuIGVuIGFuYWxpc2lzIHNvbGFtZW50ZSBsYXMgY29sdW1uYXMgZGUgICJTZXgiLCAiU3Vydml2ZWQiClNlIGRlYmUgcmVjb3JkYXIgcXVlIGVuIGxhIGNvbHVtbmEgZGUgIlN1cnZpdmVkIiB1biAxIHNpZ25pZmljYSAiU29icmV2aXZpw7MiIHkgMCAiTm8gc29icmV2aXZpw7MiCgpgYGB7cn0KdHJhaW5EQiAlPiUKICBncm91cF9ieShTZXgsU3Vydml2ZWQpICU+JQogIHN1bW1hcmlzZShuPW4oKSkgJT4lCiAgbGVmdF9qb2luKHRyYWluICU+JQogICAgZ3JvdXBfYnkoU2V4KSAlPiUKICAgIHN1bW1hcmlzZSh0b3RhbF9wYXNhamVyb3Nfc2V4bz1uKCkpKSAlPiUKICB1bmdyb3VwKCkgJT4lCiAgbXV0YXRlKHJlc3VsdGFkb3NfZmluYWxlcyA9IHJvdW5kKG4vdG90YWxfcGFzYWplcm9zX3NleG8sMikpCmBgYAoKI01vZGVsbyBkZSByZWdyZXNpb24gbGluZWFsIGNvbiBjb3J0ZSBkZSAwLjUKCmBgYHtyfQpmaXRfcmVnIDwtIGxtKFN1cnZpdmVkIH4gUGNsYXNzK1NleCtBZ2UsIGRhdGEgPSB0cmFpbkRCKQpzdW1tYXJ5KGZpdF9yZWcpCmBgYAoKUHJvbWVkaW8gZGUgZWRhZAoKYGBge3J9Cm1lYW4odGVzdCRBZ2UsbmEucm09VFJVRSkKYGBgCgpNZWRpYW5hIGRlIGVkYWQKCmBgYHtyfQptZWRpYW4odGVzdCRBZ2UsbmEucm09VFJVRSkKYGBgCgpgYGB7cn0KdGVzdERCJEFnZSA8LSBpZmVsc2UoaXMubmEodGVzdCRBZ2UpLDI3LHRlc3QkQWdlKQpwcmVkX3JlZyA8LSBwcmVkaWN0KGZpdF9yZWcsdGVzdERCKQptb2QyIDwtIGNiaW5kKHRlc3REQixTdXJ2aXZlZCA9IHByZWRfcmVnKSAlPiUgCiAgc2VsZWN0KFBhc3NlbmdlcklkLFN1cnZpdmVkKSAlPiUgCiAgbXV0YXRlIChTdXJ2aXZlZCA9IGlmX2Vsc2UoU3Vydml2ZWQ+PTAuNSwxLDApKQptb2QyICU+JSB3cml0ZV9jc3YoIm1vZGVsbzJfY29ydGUwNS5jc3YiKQpgYGAKCkV4YWN0aXR1ZCBhcHJveGltYWRhIGRlOiAqMC43NSoKCgojTW9kZWxvIGRlIHJlZ3Jlc2lvbiBsb2dpc3RpY2EgY29uIGNvcnRlIGRlIDAuNQoKYGBge3J9CmZpdF9sb2dpc3RpY2EgPC0gZ2xtKFN1cnZpdmVkIH4gUGNsYXNzK1NleCtBZ2UsIGRhdGEgPSB0cmFpbkRCLCBmYW1pbHkgPSAiYmlub21pYWwiKQpzdW1tYXJ5KGZpdF9sb2dpc3RpY2EpCmBgYAoKYGBge3J9CnByZWRfbG9naXN0aWNhIDwtIHByZWRpY3QoZml0X2xvZ2lzdGljYSx0ZXN0REIsdHlwZT0icmVzcG9uc2UiKQpTb2JyZXZpdmllbnRlcyA8LSBpZmVsc2UocHJlZF9sb2dpc3RpY2E+MC41LDEsMCkKbW9kZWxvX3JlZ19sb2cwNSA8LSBjYmluZCh0ZXN0LFNvYnJldml2aWVudGVzKSAlPiUgCiAgZHBseXI6OnNlbGVjdChQYXNzZW5nZXJJZCxTb2JyZXZpdmllbnRlcykKbW9kZWxvX3JlZ19sb2cwNSAlPiUgCiAgd3JpdGVfY3N2KCJtb2RlbG9fcmVnX2xvZzA1LmNzdiIpCmBgYAoKRXhhY3RpdHVkIGFwcm94aW1hZGEgZGU6IDAuNzQKCgpFbnRvbmNlcyBwb2RlbW9zIHZlciBxdWUgbGEgcmVncmVzaW9uIGxpbmVhbCBjb24gY29ydGUgZGUgMC41IGVzIHVuYSBkZSBsYXMgbWVqb3JlcyBvcGNpb25lcyB5YSBxdWUgdGllbmUgdW5hIGV4YWN0aXR1ZCBkZSAwLjc1LgoKCg==