Defino el directorio donde trabajaré:
# Make the lab folder the working directory so the relative file names used
# below ("train.csv", "test.csv", output CSVs) resolve against it.
# NOTE(review): this is a user-specific path; the script will fail on a
# machine where this folder does not exist.
setwd("~/Documents/Econometria/Lab3")
Cargamos las librerías necesarias:
library(readr)
library(dplyr)
library(tidyr)
Cargamos las bases de datos de Train y Test:
# Read the training set from "train.csv" (relative to the working directory)
# into trainDB; column types are guessed by read_csv.
trainDB <- read_csv("train.csv")
Parsed with column specification:
cols(
PassengerId = col_integer(),
Survived = col_integer(),
Pclass = col_integer(),
Name = col_character(),
Sex = col_character(),
Age = col_double(),
SibSp = col_integer(),
Parch = col_integer(),
Ticket = col_character(),
Fare = col_double(),
Cabin = col_character(),
Embarked = col_character()
)
# Read the test set from "test.csv" (relative to the working directory)
# into testDB; column types are guessed by read_csv.
testDB <- read_csv("test.csv")
Parsed with column specification:
cols(
PassengerId = col_integer(),
Pclass = col_integer(),
Name = col_character(),
Sex = col_character(),
Age = col_double(),
SibSp = col_integer(),
Parch = col_integer(),
Ticket = col_character(),
Fare = col_double(),
Cabin = col_character(),
Embarked = col_character()
)
# Print a compact overview of trainDB: one line per column with its type and
# first values.
glimpse(trainDB)
Observations: 891
Variables: 12
$ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 2...
$ Survived <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,...
$ Pclass <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3, 2, 2, 3,...
$ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Florence Bri...
$ Sex <chr> "male", "female", "female", "female", "male", "male", "male", "male"...
$ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, 55, 2, NA,...
$ SibSp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0, 0, 0, 0,...
$ Parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
$ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "373450", "33...
$ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625, 21.0750, ...
$ Cabin <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", "C103", NA, ...
$ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S", "S", "S"...
Hombres que NO sobrevivieron y Mujeres que sí sobrevivieron
Se incluyen en el análisis solamente las columnas “Sex” y “Survived”. Se debe recordar que en la columna “Survived” un 1 significa “Sobrevivió” y un 0 “No sobrevivió”.
# Share of passengers of each sex that did / did not survive.
# Counts per (Sex, Survived) pair are divided by the total number of
# passengers of that sex and rounded to two decimals.
trainDB %>%
  group_by(Sex, Survived) %>%
  summarise(n = n()) %>%
  # Per-sex totals are computed from trainDB too (the previous code referenced
  # an undefined object `train`). No `by` is given, so the join uses the only
  # shared column, Sex.
  left_join(
    trainDB %>%
      group_by(Sex) %>%
      summarise(total_pasajeros_sexo = n())
  ) %>%
  ungroup() %>%
  mutate(resultados_finales = round(n / total_pasajeros_sexo, 2))
Joining, by = "Sex"
# Fit an ordinary least-squares regression of the 0/1 Survived column on
# Pclass, Sex and Age using the training data, then print the model summary.
fit_reg <- lm(Survived ~ Pclass+Sex+Age, data = trainDB)
summary(fit_reg)
Call:
lm(formula = Survived ~ Pclass + Sex + Age, data = trainDB)
Residuals:
Min 1Q Median 3Q Max
-1.11224 -0.25417 -0.06352 0.22700 1.00737
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.326066 0.062606 21.181 < 2e-16 ***
Pclass -0.202916 0.018891 -10.741 < 2e-16 ***
Sexmale -0.479293 0.030671 -15.627 < 2e-16 ***
Age -0.005453 0.001082 -5.042 5.86e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3846 on 710 degrees of freedom
(177 observations deleted due to missingness)
Multiple R-squared: 0.3902, Adjusted R-squared: 0.3876
F-statistic: 151.4 on 3 and 710 DF, p-value: < 2.2e-16
Promedio de edad
# Mean age in the test set, ignoring missing values.
# Uses testDB, the object the test data was loaded into (`test` is undefined).
mean(testDB$Age, na.rm = TRUE)
Mediana de edad
# Median age in the test set, ignoring missing values.
# Uses testDB, the object the test data was loaded into (`test` is undefined).
median(testDB$Age, na.rm = TRUE)
# Replace missing ages in the test set with 27.
# NOTE(review): 27 is presumably the median test-set age — confirm against the
# median computed earlier in this document.
# The previous code read from an undefined object `test`; testDB is the data
# frame the test file was loaded into.
testDB$Age <- ifelse(is.na(testDB$Age), 27, testDB$Age)

# Fitted values of the linear model for every test passenger.
pred_reg <- predict(fit_reg, testDB)

# Turn the fitted values into a 0/1 label (1 when the value is at least 0.5)
# and keep only the two columns that are written out.
mod2 <- cbind(testDB, Survived = pred_reg) %>%
  select(PassengerId, Survived) %>%
  mutate(Survived = if_else(Survived >= 0.5, 1, 0))

# Write the predictions for the 0.5 cut-off to disk.
mod2 %>% write_csv("modelo2_corte05.csv")
Exactitud aproximada de: 0.75
# Fit a logistic regression (binomial family) of Survived on Pclass, Sex and
# Age using the training data, then print the model summary.
fit_logistica <- glm(Survived ~ Pclass+Sex+Age, data = trainDB, family = "binomial")
summary(fit_logistica)
Call:
glm(formula = Survived ~ Pclass + Sex + Age, family = "binomial",
data = trainDB)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.7270 -0.6799 -0.3947 0.6483 2.4668
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 5.056006 0.502128 10.069 < 2e-16 ***
Pclass -1.288545 0.139259 -9.253 < 2e-16 ***
Sexmale -2.522131 0.207283 -12.168 < 2e-16 ***
Age -0.036929 0.007628 -4.841 1.29e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 964.52 on 713 degrees of freedom
Residual deviance: 647.29 on 710 degrees of freedom
(177 observations deleted due to missingness)
AIC: 655.29
Number of Fisher Scoring iterations: 5
# Predicted survival probabilities from the logistic model for every test
# passenger (type = "response" returns values on the probability scale).
pred_logistica <- predict(fit_logistica, testDB, type = "response")

# 0/1 label: 1 when the predicted probability is strictly above 0.5.
Sobrevivientes <- ifelse(pred_logistica > 0.5, 1, 0)

# Attach the labels to the test data and keep only the id and the label.
# The previous code bound against an undefined object `test`; testDB is the
# data frame the test file was loaded into.
# NOTE(review): the label column is named `Sobrevivientes` here, while the
# linear-model output file names it `Survived` — confirm which header the
# consumer of these files expects.
modelo_reg_log05 <- cbind(testDB, Sobrevivientes) %>%
  dplyr::select(PassengerId, Sobrevivientes)

# Write the predictions for the 0.5 cut-off to disk.
modelo_reg_log05 %>%
  write_csv("modelo_reg_log05.csv")
Exactitud aproximada de: 0.74
Entonces podemos ver que la regresión lineal con corte de 0.5 es la mejor de las dos opciones, ya que tiene una exactitud aproximada de 0.75, frente a 0.74 de la regresión logística.