# Load the Titanic training and test sets.
work.train <- read.csv("/Users/sofia/Desktop/SPRING 2019/488/Titanic/train.csv")
work.test <- read.csv("/Users/sofia/Desktop/SPRING 2019/488/Titanic/test.csv")
# The test set has no outcome yet. Use a real missing value here, not the
# string "NA": a character value would turn the whole Survived column into
# text once the two sets are stacked, and is.na() would not recognise it.
work.test$Survived <- NA_integer_
# Stack train on top of test so the later steps work on one data frame.
# rbind() on data frames matches columns by name, so it does not matter that
# Survived was appended as the last column of work.test.
all <- rbind(work.train, work.test)
# Family size category coded from SibSp: 0 = none, 1 = one or two, 2 = three
# or more. Each comparison adds one step; a missing SibSp stays at 0.
all$fam.size <- with(
  all,
  as.numeric(!is.na(SibSp) & SibSp > 0) +
    as.numeric(!is.na(SibSp) & SibSp >= 3)
)
# Indicator for a missing age (1 = missing, 0 = present), kept so the imputed
# ages can be checked for reasonability later. Any age that is not a positive
# number counts as missing.
all$miss.age <- as.numeric(is.na(all$Age) | all$Age <= 0)
# TRUE/FALSE title flags taken from the passenger name.
# "Master": used to check that those passengers get an imputed age under 18.
# "Mr." / "Mrs.": used to check that those imputed ages come out over 18.
chars <- all$Name
value <- "Master"
mr <- "Mr."
mrs <- "Mrs."
miss <- "Miss."
# fixed = TRUE matches each title literally. Read as a regular expression the
# "." is a wildcard, so the pattern "Mr." also matched every "Mrs" name and
# the mr flag was TRUE for married women as well.
master.flag <- grepl(value, chars, fixed = TRUE)
mr.flag <- grepl(mr, chars, fixed = TRUE)
mrs.flag <- grepl(mrs, chars, fixed = TRUE)
miss.flag <- grepl(miss, chars, fixed = TRUE)
all$master <- master.flag
all$mr <- mr.flag
all$mrs <- mrs.flag
all$miss <- miss.flag
library(mice)
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(tree)
# Exclude certain variables from multiple imputation
# A dry run (maxit = 0) performs no iterations; it is only used to obtain the
# default predictor matrix and the default per-column imputation methods.
Matrix <- mice(all, maxit = 0)
## Warning: Number of logged events: 1
predM <- Matrix$predictorMatrix
# A zeroed column is never used as a predictor. That covers the identifier and
# free-text columns, the two derived helper columns, and the outcome Survived
# (it must not leak into the imputed features).
predM[, c(
  "PassengerId", "Name", "Ticket", "Cabin",
  "miss.age", "fam.size", "Survived"
)] <- 0
# An empty method string tells mice not to impute that column, so the outcome
# is never filled in by the imputation model even if it contains NA.
meth <- Matrix$method
meth["Survived"] <- ""
# Get the final data-frame with imputed values filled in 'Age'
# A fixed seed makes the five imputed data sets reproducible between runs.
imp <- mice(all, predictorMatrix = predM, method = meth, m = 5, seed = 488)
##
## iter imp variable
## 1 1 Age Fare
## 1 2 Age Fare
## 1 3 Age Fare
## 1 4 Age Fare
## 1 5 Age Fare
## 2 1 Age Fare
## 2 2 Age Fare
## 2 3 Age Fare
## 2 4 Age Fare
## 2 5 Age Fare
## 3 1 Age Fare
## 3 2 Age Fare
## 3 3 Age Fare
## 3 4 Age Fare
## 3 5 Age Fare
## 4 1 Age Fare
## 4 2 Age Fare
## 4 3 Age Fare
## 4 4 Age Fare
## 4 5 Age Fare
## 5 1 Age Fare
## 5 2 Age Fare
## 5 3 Age Fare
## 5 4 Age Fare
## 5 5 Age Fare
# complete() with its default settings returns the first of the imputed
# data sets.
imputed <- complete(imp)
# Split back into train and test using the size of the loaded training set
# instead of hard-coded row numbers, so the split stays correct if the input
# files change. The training rows come first because train was stacked on top.
n.train <- nrow(work.train)
in.train <- seq_len(nrow(imputed)) <= n.train
# Columns 1:13 are kept as before (presumably the original CSV columns plus
# fam.size - confirm against the column order of the input files).
dat.train <- imputed[in.train, 1:13]
dat.test <- imputed[!in.train, 1:13]
# Tree Model
# The steps below treat the prediction as one numeric vector of survival
# probabilities. That requires a numeric 0/1 response (regression tree); a
# character or factor response would make predict() return a matrix of class
# probabilities instead. Going through as.character() keeps the labels intact
# if the column is a factor.
dat.train$Survived <- as.numeric(as.character(dat.train$Survived))
# NOTE: the fitted object is named `tree`, like the fitting function. Calls to
# tree() still resolve to the function, but the name is easy to misread.
tree <- tree(
  Survived ~ Pclass + Sex + Age + Parch + Fare + Embarked + fam.size,
  data = dat.train
)
predict.tree <- predict(tree, dat.test)
# Flag any predicted probability > 0.5 as survived (1), everything else as 0
dat.test$Survived <- as.integer(!is.na(predict.tree) & predict.tree > 0.5)
# Mean squared gap between the predicted probabilities and the 0/1 labels
# derived from those same probabilities. NOTE: this is not a test-set error -
# the true test outcomes are unknown here - it only shows how far the
# predictions sit from 0 or 1.
mean((predict.tree - dat.test$Survived)^2)
## [1] 0.04890042
# Keep the passenger id and the predicted outcome, selected by name so the
# result does not depend on column positions.
tree.out <- dat.test[, c("PassengerId", "Survived")]
# row.names = FALSE: by default write.csv also writes the row labels as an
# extra, unnamed first column, which does not belong in the predictions file.
write.csv(
  tree.out,
  file = "//Users/sofia/Desktop/SPRING 2019/488/Titanic/Titanic predictions.csv",
  row.names = FALSE
)