# Load the Titanic training and test sets.
# stringsAsFactors = TRUE is spelled out because read.csv()'s default changed
# to FALSE in R 4.0.0; with it, text columns are read as factors on every
# R version (presumably what this script was written against -- confirm).
train <- read.csv("/Users/sofia/Desktop/SPRING 2019/488/Titanic/train.csv",
                  stringsAsFactors = TRUE)
test <- read.csv("/Users/sofia/Desktop/SPRING 2019/488/Titanic/test.csv",
                 stringsAsFactors = TRUE)

# Work on copies so the raw inputs stay untouched.
work.train <- train
work.test <- test

# Give the test rows a Survived column holding a real missing value.
# The string "NA" is not a missing value: it would turn the whole Survived
# column into character once the two frames are stacked.
work.test$Survived <- NA_integer_

# Stack train on top of test so the feature engineering and imputation
# below are applied to both at once.
all <- rbind(work.train, work.test)

# Family-size band: 0 = none, 1 = one or two, 2 = three or more.
# NOTE(review): the band is computed from SibSp alone (Parch is not added in),
# so it really describes siblings/spouses aboard -- confirm that is intended.
# Each comparison contributes one step, so SibSp >= 3 lands on 2.
all$fam.size <- as.numeric(all$SibSp > 0) + as.numeric(all$SibSp >= 3)
# A missing SibSp stays in band 0.
all$fam.size[is.na(all$fam.size)] <- 0

# Flag rows whose Age is missing (1 = missing, 0 = observed) so the imputed
# ages can be given a reasonability check later.
# is.na() tests missingness directly; the previous `Age > 0` test only worked
# because NA subscripts are skipped on assignment, and it would also have
# flagged any non-positive age as missing.
all$miss.age <- as.numeric(is.na(all$Age))

# TRUE/FALSE title flags taken from the passenger name.
#   master          - passengers with "Master" in their name; their imputed
#                     age should be < 18
#   mr / mrs / miss - honorific flags; imputed ages for Mr./Mrs. should be
#                     at least 18
chars <- all$Name

value <- "Master"
mr <- "Mr."
mrs <- "Mrs."
miss <- "Miss."

# fixed = TRUE matches the titles literally. Read as a regular expression the
# "." matches any character, so "Mr." also matched every "Mrs." name.
master.flag <- grepl(value, chars, fixed = TRUE)
mr.flag <- grepl(mr, chars, fixed = TRUE)
mrs.flag <- grepl(mrs, chars, fixed = TRUE)
miss.flag <- grepl(miss, chars, fixed = TRUE)

all$master <- master.flag
all$mr <- mr.flag
all$mrs <- mrs.flag
all$miss <- miss.flag


library(VIM) 
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(mice)
## Loading required package: lattice
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Inspect the combined data: print the column names, then draw the
# missing-value aggregation plot (aggr() comes from the VIM package).
names(all)
##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
## [11] "Cabin"       "Embarked"    "fam.size"    "miss.age"    "master"     
## [16] "mr"          "mrs"         "miss"
aggr(
  all,
  numbers = TRUE,
  prop = c(TRUE, FALSE)
)

# Multiple imputation with mice.
# A dry run (maxit = 0) yields the default predictor matrix and per-column
# imputation methods, which are then adjusted before the real run.
Matrix <- mice(all, maxit = 0)
## Warning: Number of logged events: 1
predM <- Matrix$predictorMatrix
meth <- Matrix$method

# Columns that must not be used as predictors when imputing other columns:
# identifiers / free text, the helper columns built above, and the outcome
# Survived (it is the modelling target, so it is kept out of the imputation
# model however the column happens to be typed).
no.predict <- c("PassengerId", "Survived", "Name", "Ticket", "Cabin",
                "miss.age", "fam.size")
predM[, no.predict] <- 0

# An empty method means "do not impute this column": never fill in Survived.
meth["Survived"] <- ""

# m = 5 imputed data sets; a fixed seed makes the imputations reproducible.
imp <- mice(all, predictorMatrix = predM, method = meth, m = 5, seed = 488)
## 
##  iter imp variable
##   1   1  Age  Fare
##   1   2  Age  Fare
##   1   3  Age  Fare
##   1   4  Age  Fare
##   1   5  Age  Fare
##   2   1  Age  Fare
##   2   2  Age  Fare
##   2   3  Age  Fare
##   2   4  Age  Fare
##   2   5  Age  Fare
##   3   1  Age  Fare
##   3   2  Age  Fare
##   3   3  Age  Fare
##   3   4  Age  Fare
##   3   5  Age  Fare
##   4   1  Age  Fare
##   4   2  Age  Fare
##   4   3  Age  Fare
##   4   4  Age  Fare
##   4   5  Age  Fare
##   5   1  Age  Fare
##   5   2  Age  Fare
##   5   3  Age  Fare
##   5   4  Age  Fare
##   5   5  Age  Fare
# Build a completed data frame from the imputation result.
# action = 1 (the default) returns only the first of the imputed data sets;
# the remaining ones are not used.
imputed <- complete(imp, action = 1)

# Split back into train and test. `all` was built as train stacked on top of
# test, so the first nrow(work.train) rows are the training rows; deriving the
# boundary avoids hard-coding 891 / 1309.
n.train <- nrow(work.train)
is.train <- seq_len(nrow(imputed)) <= n.train

# Columns 1-13 are the original variables plus fam.size (see the column
# listing printed earlier); miss.age and the title flags are dropped.
dat.train <- imputed[is.train, 1:13]
dat.test <- imputed[!is.train, 1:13]


library(tree)
# Fit a tree for Survived on the training rows, with explicit growth controls.
# NOTE(review): the result is stored under the name `tree`, the same name as
# the fitting function.
tree <- tree(
  Survived ~ Pclass + Sex + Age + Parch + Fare + Embarked + fam.size,
  data = dat.train,
  control = tree.control(
    nobs = nrow(dat.train),
    mincut = 10,
    minsize = 30,
    mindev = 0.01
  )
)
# [1] 0.04576097
#tree<-tree(Survived ~ Pclass + Sex + Age + Parch + Fare + Embarked + fam.size, data=dat.train)

# Predicted values for the test rows.
predict.tree <- predict(tree, newdata = dat.test)

# Label a passenger as survived (1) when the predicted value exceeds 0.5,
# otherwise 0.
dat.test$Survived <- 0
dat.test$Survived[which(predict.tree > 0.5)] <- 1

# Mean squared difference between the predicted values and the 0/1 labels.
# NOTE(review): the labels were just derived from these same predictions, so
# this is not an error against true outcomes.
mean((dat.test$Survived - predict.tree)^2)
## [1] 0.04863142
#tree.out <- t.test[,1:2]
#write.csv(tree.out, file="//Users/sofia/Desktop/SPRING 2019/488/Titanic/tree predictions.csv")