# Load the Kaggle Titanic training and test sets.
# NOTE(review): these absolute paths only resolve on the original author's
# machine; point them at your local copies before running elsewhere.
train <- read.csv("/Users/sofia/Desktop/SPRING 2019/488/Titanic/train.csv")
test <- read.csv("/Users/sofia/Desktop/SPRING 2019/488/Titanic/test.csv")

# Work on copies so the raw data frames stay untouched.
work.train <- train
work.test <- test

# The test set has no outcome column. Add it as a genuine missing value so the
# columns line up for rbind(). The string "NA" is not a missing value: it would
# coerce the whole stacked Survived column to character and hide the gaps from
# is.na() and from the imputation step.
# NOTE(review): assumes train$Survived is read in as integer 0/1 -- confirm.
work.test$Survived <- NA_integer_

# Stack train on top of test so feature engineering and imputation are applied
# to both sets at once (rbind on data frames matches columns by name).
all <- rbind(work.train, work.test)
# Family-size bucket built from the sibling/spouse count (SibSp):
#   0 = none, 1 = one or two, 2 = three or more.
# Each threshold that is met adds one step; a missing SibSp stays in bucket 0.
all$fam.size <- with(
  all,
  as.numeric(!is.na(SibSp) & SibSp > 0) +
    as.numeric(!is.na(SibSp) & SibSp >= 3)
)
# Flag rows whose Age is missing (1 = missing, 0 = observed) so the imputed
# ages can be sanity-checked later.
# Test missingness directly with is.na() instead of inferring it from
# `Age > 0`, which would also flag a recorded age of 0 (or below) as missing.
all$miss.age <- as.numeric(is.na(all$Age))
# TRUE/FALSE flags for the title embedded in each passenger's name.
# Intended use (per the original author): passengers titled "Master" should end
# up with an imputed age under 18, and "Mr."/"Mrs." with an imputed age over 18.
chars <- all$Name
value <- "Master"
mr <- "Mr."
mrs <- "Mrs."
miss <- "Miss."

# fixed = TRUE matches the titles literally. As regular expressions the "."
# means "any character", so "Mr." would also match the "Mrs" in "Mrs." and
# every Mrs would be flagged as Mr as well.
master.flag <- grepl(value, chars, fixed = TRUE)
mr.flag <- grepl(mr, chars, fixed = TRUE)
mrs.flag <- grepl(mrs, chars, fixed = TRUE)
miss.flag <- grepl(miss, chars, fixed = TRUE)

# Attach the flags to the combined data frame.
all$master <- master.flag
all$mr <- mr.flag
all$mrs <- mrs.flag
all$miss <- miss.flag
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(mice)
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
# List the columns of the combined data, including the engineered features.
names(all)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked" "fam.size" "miss.age" "master"
## [16] "mr" "mrs" "miss"
# Plot an overview of the missing-value pattern across all columns (VIM).
aggr(
  all,
  numbers = TRUE,
  prop = c(TRUE, FALSE)
)

# Keep identifier-like and derived variables out of the imputation models.
# A dry run (maxit = 0) performs no iterations; it is only used to obtain the
# default predictor matrix.
Matrix <- mice(all, maxit = 0)
## Warning: Number of logged events: 1
predM <- Matrix[["predictorMatrix"]]
# Zeroing a column removes that variable as a predictor for every target.
predM[
  ,
  c("PassengerId", "Name", "Ticket", "Cabin", "miss.age", "fam.size")
] <- 0
# Run the multiple imputation with the restricted predictor matrix, producing
# m = 5 imputed data sets.
# The imputation is stochastic; a fixed seed makes the imputed values (and
# everything fitted on them afterwards) reproducible from run to run.
imp <- mice(all, predictorMatrix = predM, m = 5, seed = 488)
# mice() prints an iteration log (iter / imp / variable) while it runs; the
# pasted console output is omitted here.
# Build the analysis data frame with the imputed values filled in.
# NOTE(review): complete() with default arguments returns only the first of
# the m imputed data sets; the remaining imputations are not used.
imputed <- complete(imp)

# Split the stacked data back into train and test. The boundary comes from the
# size of the training set instead of hard-coded row numbers (891 / 1309), so
# the split stays correct if the input files change.
n.train <- nrow(work.train)
is.train <- seq_len(nrow(imputed)) <= n.train

# Columns 1:13 run from PassengerId through fam.size; miss.age and the title
# flags are dropped.
dat.train <- imputed[is.train, 1:13]
dat.test <- imputed[!is.train, 1:13]
library(tree)
# Fit a tree model for Survived on the training rows, with explicit growth
# controls (minimum child size, minimum node size to split, minimum deviance).
tree <- tree(
  Survived ~ Pclass + Sex + Age + Parch + Fare + Embarked + fam.size,
  data = dat.train,
  control = tree.control(
    nobs = nrow(dat.train),
    mincut = 10,
    minsize = 30,
    mindev = 0.01
  )
)
# [1] 0.04576097
# Alternative kept for reference: the same model with default controls.
#tree<-tree(Survived ~ Pclass + Sex + Age + Parch + Fare + Embarked + fam.size, data=dat.train)

# Score the test rows with the fitted tree.
predict.tree <- predict(tree, newdata = dat.test)

# Label a passenger as survived (1) when the predicted value exceeds 0.5,
# otherwise 0.
dat.test$Survived <- replace(
  numeric(nrow(dat.test)),
  predict.tree > 0.5,
  1
)

# Mean squared gap between the raw predictions and their own 0/1 labels.
# NOTE(review): this is not an out-of-sample MSE, since no true test outcomes
# are involved; it only measures how far predictions sit from 0 or 1.
mean((predict.tree - dat.test$Survived)^2)
## [1] 0.04863142
#tree.out <- t.test[,1:2]
#write.csv(tree.out, file="//Users/sofia/Desktop/SPRING 2019/488/Titanic/tree predictions.csv")