library(dplyr)
library(ggplot2)
library(Amelia)
Loading required package: Rcpp
##
## Amelia II: Multiple Imputation
## (Version 1.7.4, built: 2015-12-05)
## Copyright (C) 2005-2016 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
##
# Read the training and test sets. Text columns stay as character vectors
# and empty fields are read as NA.
train <- read.csv(
  file = "train.csv",
  na.strings = "",
  stringsAsFactors = FALSE
)
test <- read.csv(
  file = "test.csv",
  na.strings = "",
  stringsAsFactors = FALSE
)
# Column names of the training set.
names(train)
[1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
[6] "Age" "SibSp" "Parch" "Ticket" "Fare"
[11] "Cabin" "Embarked"
names(test)
[1] "PassengerId" "Pclass" "Name" "Sex" "Age"
[6] "SibSp" "Parch" "Ticket" "Fare" "Cabin"
[11] "Embarked"
glimpse(train)
Observations: 891
Variables: 12
$ PassengerId (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,...
$ Survived (int) 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1...
$ Pclass (int) 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3...
$ Name (chr) "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl...
$ Sex (chr) "male", "female", "female", "female", "male", "male", "mal...
$ Age (dbl) 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, ...
$ SibSp (int) 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0...
$ Parch (int) 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0...
$ Ticket (chr) "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37...
$ Fare (dbl) 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,...
$ Cabin (chr) NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", "C...
$ Embarked (chr) "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"...
# Absolute frequency of each value of Survived in the training set.
table(train$Survived, dnn = "Survived")
Survived
0 1
549 342
# Relative frequency of each value of Survived in the training set.
prop.table(table(train$Survived, dnn = "Survived"))
Survived
0 1
0.6161616 0.3838384
# Baseline: give every row of the test set Survived = 0.
test$Survived <- numeric(nrow(test))
glimpse(test)
Observations: 418
Variables: 12
$ PassengerId (int) 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903...
$ Pclass (int) 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 1, 1, 2, 1, 2, 2, 3, 3, 3...
$ Name (chr) "Kelly, Mr. James", "Wilkes, Mrs. James (Ellen Needs)", "M...
$ Sex (chr) "male", "female", "male", "male", "female", "male", "femal...
$ Age (dbl) 34.5, 47.0, 62.0, 27.0, 22.0, 14.0, 30.0, 26.0, 18.0, 21.0...
$ SibSp (int) 0, 1, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0...
$ Parch (int) 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ Ticket (chr) "330911", "363272", "240276", "315154", "3101298", "7538",...
$ Fare (dbl) 7.8292, 7.0000, 9.6875, 8.6625, 12.2875, 9.2250, 7.6292, 2...
$ Cabin (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "B45", NA,...
$ Embarked (chr) "Q", "S", "Q", "S", "S", "S", "Q", "S", "C", "S", "S", "S"...
$ Survived (dbl) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
# Submission 1: PassengerId plus the all-zero Survived column of the test set.
prediccion1 <- with(
  test,
  data.frame(PassengerId = PassengerId, Survived = Survived)
)
prediccion1
write.csv(prediccion1, file = "allperish.csv", row.names = FALSE)
# Cross-tabulate Survived (rows) against Sex (columns) in the training set.
table(train$Survived, train$Sex)
female male
0 81 468
1 233 109
\[Card(Sex_i \cap Survived_j)\]
prop.table(table(train$Survived,train$Sex))
female male
0 0.09090909 0.52525253
1 0.26150393 0.12233446
\[P(Sex_i \cap Survived_j )\]
# Submission 2: predict Survived = 1 for passengers whose Sex is "female"
# and 0 for everyone else.
test$Survived <- ifelse(test$Sex == "female", 1, 0)
prediccion2 <- with(
  test,
  data.frame(PassengerId = PassengerId, Survived = Survived)
)
write.csv(prediccion2, file = "swomen.csv", row.names = FALSE)
# Linear regression of Survived on Pclass, Sex, Age and SibSp. Rows with a
# missing value in any of these variables are dropped (na.omit).
modelo1 <- lm(
  formula = Survived ~ Pclass + Sex + Age + SibSp,
  data = train,
  na.action = na.omit
)
summary(modelo1)
Call:
lm(formula = Survived ~ Pclass + Sex + Age + SibSp, data = train,
na.action = na.omit)
Residuals:
Min 1Q Median 3Q Max
-1.12371 -0.23934 -0.06484 0.22579 1.00073
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.396606 0.065609 21.287 < 2e-16 ***
Pclass -0.205267 0.018769 -10.937 < 2e-16 ***
Sexmale -0.486489 0.030527 -15.937 < 2e-16 ***
Age -0.006557 0.001123 -5.839 8.01e-09 ***
SibSp -0.054517 0.016230 -3.359 0.000824 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3818 on 709 degrees of freedom
(177 observations deleted due to missingness)
Multiple R-squared: 0.3997, Adjusted R-squared: 0.3963
F-statistic: 118 on 4 and 709 DF, p-value: < 2.2e-16
# Reload the test set from disk; this also discards the Survived column that
# was added to it for the earlier submissions.
test <- read.csv(
  file = "test.csv",
  na.strings = "",
  stringsAsFactors = FALSE
)
# Fitted values of modelo1 for the rows of the test set.
prediccion3 <- predict(modelo1, newdata = test)
summary(prediccion3)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-0.2369 0.1566 0.3799 0.4312 0.6678 1.0540 86
# Histogram of the fitted values.
hist(prediccion3)
# Turn fitted values into classes: 1 when the value exceeds 0.67, 0 otherwise.
# Missing fitted values remain NA at this point.
prediccion3 <- ifelse(prediccion3 > 0.67, 1, 0)
table(prediccion3)
prediccion3
0 1
337 81
# Rows without a prediction are assigned class 0.
prediccion3[is.na(prediccion3)] <- 0
# Number of missing predictions left.
sum(is.na(prediccion3))
[1] 0
# Submission 3: pair each PassengerId with its predicted class and save it.
prediccion3 <- data.frame(
  PassengerId = test$PassengerId,
  Survived = prediccion3
)
write.csv(prediccion3, file = "linreg.csv", row.names = FALSE)
# Scatter plot of the predicted class against PassengerId, with the rows
# sorted by predicted class first.
ggplot(
  arrange(prediccion3, Survived),
  aes(x = PassengerId, y = Survived)
) +
  geom_point()
# Missingness map of the training set (Amelia).
missmap(train)
# Count how many entries of `x` are NA.
#
# x: a vector (or anything is.na() accepts).
# Returns a single integer.
num_na <- function(x) {
  missing_positions <- which(is.na(x))
  length(missing_positions)
}
sapply(train,num_na)
PassengerId Survived Pclass Name Sex Age
0 0 0 0 0 177
SibSp Parch Ticket Fare Cabin Embarked
0 0 0 0 687 0
Title adult
0 0
# Count the distinct values of `x`, as reported by unique().
#
# x: a vector.
# Returns a single integer.
num_unico <- function(x) {
  distinct_values <- unique(x)
  length(distinct_values)
}
sapply(test,num_unico)
PassengerId Pclass Name Sex Age SibSp
418 3 418 2 80 7
Parch Ticket Fare Cabin Embarked
8 363 170 77 3
table(train$Embarked)
C Q S
168 77 644
# Replace missing Embarked values in the training set with "S".
train$Embarked[is.na(train$Embarked)] <- "S"
names(train)
[1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
[6] "Age" "SibSp" "Parch" "Ticket" "Fare"
[11] "Cabin" "Embarked"
train$Name[1:10]
[1] "Braund, Mr. Owen Harris"
[2] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
[3] "Heikkinen, Miss. Laina"
[4] "Futrelle, Mrs. Jacques Heath (Lily May Peel)"
[5] "Allen, Mr. William Henry"
[6] "Moran, Mr. James"
[7] "McCarthy, Mr. Timothy J"
[8] "Palsson, Master. Gosta Leonard"
[9] "Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)"
[10] "Nasser, Mrs. Nicholas (Adele Achem)"
train$Age[1:10]
[1] 22 38 26 35 35 NA 54 2 27 14
# Step-by-step extraction of the title from the first name:
# take the text after the comma, trim it, then split it on the period.
after_comma <- strsplit(train$Name[1], split = ",")[[1]][2]
x <- strsplit(trimws(after_comma), split = "[.]")
# First piece of that split.
x[[1]][1]
[1] "Mr"
# Same idea in one step: split the first name on either "," or ".".
y <- strsplit(x = train$Name[1], split = "[.,]")
# Pieces of the split; the second one is the title with a leading space.
y[[1]]
[1] "Braund" " Mr" " Owen Harris"
# Title of every passenger in the training set: the second piece of each name
# after splitting on "," or "." (the leading space is kept, e.g. " Mr").
#
# strsplit() is vectorised, so all names are split in one call and vapply()
# picks the second piece of each. This avoids growing `title` with c() on
# every iteration, which copies the whole vector each time, and it no longer
# leaves the loop variables behind in the workspace.
title <- vapply(
  strsplit(train$Name, split = "[.,]"),
  function(pieces) pieces[2],
  character(1)
)
title[1:15]
[1] " Mr" " Mrs" " Miss" " Mrs" " Mr" " Mr" " Mr"
[8] " Master" " Mrs" " Mrs" " Miss" " Miss" " Mr" " Mr"
[15] " Miss"
# Extract the title from names written as "Surname, Title. Given names".
#
# name: character vector of names (any length, including zero).
# Returns a character vector of the same length holding the text between the
# first "," and the following "." with surrounding whitespace removed
# (e.g. "Braund, Mr. Owen Harris" -> "Mr"). A name that has no second piece
# after splitting on "," or "." yields NA.
extract_title <- function(name) {
  pieces <- strsplit(name, split = "[.,]")
  # The second piece of each split is the title; it still carries the space
  # that follows the comma, so trim it here once instead of at every use.
  trimws(vapply(pieces, function(p) p[2], character(1)))
}
# Add a Title column to both data sets.
# vapply() guarantees a character vector (sapply() can silently change its
# return type), and the titles are trimmed once so that every value in the
# column is free of the leading space left by the split.
train$Title <- trimws(vapply(train$Name, FUN = extract_title,
                             FUN.VALUE = character(1), USE.NAMES = FALSE))
test$Title <- trimws(vapply(test$Name, FUN = extract_title,
                            FUN.VALUE = character(1), USE.NAMES = FALSE))
# Fold "Mlle" and "Mme" into "Miss".
# NOTE(review): "Mme" is mapped to "Miss" exactly as before; confirm that this
# grouping is the intended one.
train$Title[train$Title %in% c("Mlle", "Mme")] <- "Miss"
test$Title[test$Title %in% c("Mlle", "Mme")] <- "Miss"
table(train$Title)
Capt Col Don Dr Jonkheer
1 2 1 7 1
Lady Major Master Mr Mrs
1 2 40 517 125
Ms Rev Sir the Countess Miss
1 6 1 1 185
# Flag for age above 12: 1 when Age > 12, 0 otherwise, NA when Age is missing.
adult_flag <- function(age) {
  ifelse(age > 12, 1, 0)
}
train$adult <- adult_flag(train$Age)
test$adult <- adult_flag(test$Age)
# Titles (rows) against the adult flag (columns) in the training set.
table(train$Title, train$adult)
0 1
Capt 0 1
Col 0 2
Don 0 1
Dr 0 6
Jonkheer 0 1
Lady 0 1
Major 0 2
Master 36 0
Mr 1 397
Mrs 0 108
Ms 0 1
Rev 0 6
Sir 0 1
the Countess 0 1
Miss 32 117
# Complete the adult flag for rows whose Age is missing:
#   - title "Master" -> 0
#   - any other title -> 1
# Rows with a known Age keep the flag they already have.
fill_adult <- function(passengers) {
  age_missing <- is.na(passengers$Age)
  is_master <- trimws(passengers$Title) == "Master"
  ifelse(
    age_missing & is_master,
    0,
    ifelse(age_missing, 1, passengers$adult)
  )
}
train$adult <- fill_adult(train)
test$adult <- fill_adult(test)
# Linear regression of Survived on Pclass, Sex, the adult flag, SibSp and
# Embarked, fitted on the training set.
modelo2 <- lm(
  formula = Survived ~ Pclass + Sex + adult + SibSp + Embarked,
  data = train
)
prediccion4 <- predict(modelo2, newdata = test)
# Class 1 only when a fitted value exists and exceeds 0.5; missing fitted
# values and everything else become class 0.
prediccion4 <- ifelse(!is.na(prediccion4) & prediccion4 > 0.5, 1, 0)
prediccion4 <- data.frame(
  PassengerId = test$PassengerId,
  Survived = prediccion4
)