# Load the Titanic training and test sets.
# stringsAsFactors = TRUE pins the pre-R-4.0 read.csv() default, so text
# columns such as Sex and Embarked arrive as factors regardless of R version.
work.train <- read.csv("/Users/sofia/Desktop/SPRING 2019/488/Titanic/train.csv",
                       stringsAsFactors = TRUE)
work.test <- read.csv("/Users/sofia/Desktop/SPRING 2019/488/Titanic/test.csv",
                      stringsAsFactors = TRUE)

# Give the test set a Survived column holding a real missing value (NA).
# The string "NA" is not a missing value: it would coerce the combined
# Survived column to character once the two data frames are stacked.
work.test$Survived <- NA

# Stack train on top of test so feature engineering and imputation see both
# sets; rbind() on data frames matches columns by name.
all <- rbind(work.train, work.test)

# Bucket the sibling/spouse count into three levels stored in fam.size:
#   0 = SibSp of 0, 1 = SibSp of 1-2, 2 = SibSp of 3 or more.
# Only SibSp feeds this feature; Parch is not part of it.
sib.count <- all$SibSp
has.any <- !is.na(sib.count) & sib.count > 0
has.many <- !is.na(sib.count) & sib.count >= 3
all$fam.size <- as.numeric(has.any) + as.numeric(has.many)

# Flag rows whose Age is missing (1 = missing, 0 = observed) before
# imputation, so the imputed ages can be sanity-checked later.
# is.na() is used instead of a comparison such as Age > 0: a comparison
# evaluates to NA for missing ages and would also mark a recorded age of 0
# as missing.
all$miss.age <- as.numeric(is.na(all$Age))

# Logical title flags derived from the passenger name. They exist to check
# imputed ages: passengers with "Master" in their name should have an imputed
# age under 18, while "Mr."/"Mrs." passengers should come out over 18.
# fixed = TRUE matches each title literally. Interpreted as a regular
# expression the "." is a wildcard, so "Mr." would also match the "Mrs" in
# "Mrs." and flag every married woman as "Mr.".
chars <- all$Name
value <- "Master"; mr <- "Mr."; mrs <- "Mrs."; miss <- "Miss."
master.flag <- grepl(value, chars, fixed = TRUE)
mr.flag <- grepl(mr, chars, fixed = TRUE)
mrs.flag <- grepl(mrs, chars, fixed = TRUE)
miss.flag <- grepl(miss, chars, fixed = TRUE)
all$master <- master.flag
all$mr <- mr.flag
all$mrs <- mrs.flag
all$miss <- miss.flag

 
library(mice)
## Loading required package: lattice
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(tree)

# Dry run (maxit = 0) to obtain mice's default set-up, including its
# predictor matrix, without running any imputation iterations.
# (A previous run reported "Warning: Number of logged events: 1" here.)
Matrix <- mice(all, maxit = 0)
predM <- Matrix$predictorMatrix

# Zero these columns so identifiers, free-text fields and the derived helper
# columns are never used as predictors in the imputation models.
excluded.cols <- c("PassengerId", "Name", "Ticket", "Cabin", "miss.age", "fam.size")
predM[, excluded.cols] <- 0


# Run the multiple imputation (m = 5 imputed data sets). In a previous run
# the iteration log showed Age and Fare as the variables being imputed.
# Survived is the outcome: give it an empty method so it is never imputed,
# and zero its predictor column so rows without a label cannot influence the
# imputed values of other variables.
meth <- Matrix$method
meth["Survived"] <- ""
predM[, "Survived"] <- 0
# A fixed seed makes the stochastic imputation reproducible between runs.
imp <- mice(all, predictorMatrix = predM, method = meth, m = 5, seed = 488)
# Extract one completed data set (complete() returns the first imputation by
# default) and split it back into training and test rows.
imputed <- complete(imp)
# Derive the split from the size of the training data instead of hard-coding
# row numbers; the training rows come first in the stacked data frame.
n.train <- nrow(work.train)
is.train <- seq_len(nrow(imputed)) <= n.train
# Keep only the first 13 columns. NOTE(review): presumably the 12 raw columns
# plus fam.size, dropping miss.age and the title flags - confirm column order.
dat.train <- imputed[is.train, 1:13]
dat.test <- imputed[!is.train, 1:13]


# Tree Model
# Fit a regression tree on the 0/1 outcome so that predict() returns one
# survival probability per passenger. A non-numeric Survived (character or
# factor) would be modelled as a class label, and predict() would then return
# a matrix of class probabilities that breaks the thresholding below, so
# coerce it first.
if (!is.numeric(dat.train$Survived)) {
  dat.train$Survived <- as.numeric(as.character(dat.train$Survived))
}
# tree::tree is called explicitly because the fitted model is stored in a
# variable that is also named `tree`.
tree <- tree::tree(Survived ~ Pclass + Sex + Age + Parch + Fare + Embarked + fam.size, data=dat.train)
predict.tree <- predict(tree, dat.test)

# Flag any predicted probability > 0.5 as survived (1); everything else is 0
dat.test$Survived<-0
dat.test$Survived[predict.tree > 0.5] <- 1

# MSE
# The test-set labels are unknown, so report the training-set
# (resubstitution) MSE. Comparing the test predictions with labels that were
# thresholded from those same predictions only measures how far the fitted
# probabilities are from 0/1, not model error.
# This training-set figure is optimistic; it is not an out-of-sample estimate.
train.pred <- predict(tree, dat.train)
train.obs <- as.numeric(as.character(dat.train$Survived))
mean((train.pred - train.obs)^2)
# Write the predictions file: passenger id plus predicted outcome.
# Columns are selected by name rather than position, and row.names = FALSE
# stops write.csv() from adding an extra unnamed index column.
# The output path uses a single leading slash, matching the input paths.
tree.out <- dat.test[, c("PassengerId", "Survived")]
write.csv(tree.out, file="/Users/sofia/Desktop/SPRING 2019/488/Titanic/Titanic predictions.csv", row.names=FALSE)