Read in data and create flags

library(mice); library(tree)
## Loading required package: lattice
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
# Load the Kaggle Titanic training and test sets from local CSV files.
train <- read.csv(file = "/Users/sofia/Desktop/MS/SPRING 2019/488/Titanic/train.csv")
test <- read.csv(file = "/Users/sofia/Desktop/MS/SPRING 2019/488/Titanic/test.csv")

# Give the test set a Survived column so both sets can be stacked row-wise.
# NOTE(review): "NA" here is a character string, not a missing value (NA);
# confirm this is intended, since it likely changes the type of Survived
# in the combined data.
test[["Survived"]] <- "NA"
all <- rbind(train, test)


# Bucket SibSp into three groups: 0 = none, 1 = one or two, 2 = three or more.
all$sib.size <- local({
  grp <- numeric(nrow(all))            # everyone starts in group 0
  grp[which(all$SibSp > 0)] <- 1       # at least one sibling/spouse
  grp[which(all$SibSp >= 3)] <- 2      # three or more overrides group 1
  grp
})

# Flag missing ages (1 = Age is NA, 0 = Age observed) so the imputed values can
# be checked for reasonableness later. is.na() tests missingness directly rather
# than inferring it from Age > 0, which would also flag a recorded age of 0.
all$miss.age <- as.numeric(is.na(all$Age))

# TRUE/FALSE title flags taken from the passenger name.
# "Master" is flagged so we can check that those passengers get imputed ages
# under 18; "Mr."/"Mrs." are flagged because most of their imputed ages should
# be over 18.
# fixed = TRUE matches each title literally. As a regular expression the "." in
# "Mr." matches any character, so "Mr." would also flag every "Mrs." passenger.
chars <- all$Name
value <- "Master"; mr <- "Mr."; mrs <- "Mrs."; miss <- "Miss."
master.flag <- grepl(value, chars, fixed = TRUE)
mr.flag <- grepl(mr, chars, fixed = TRUE)
mrs.flag <- grepl(mrs, chars, fixed = TRUE)
miss.flag <- grepl(miss, chars, fixed = TRUE)
all$master <- master.flag
all$mr <- mr.flag
all$mrs <- mrs.flag
all$miss <- miss.flag

Perform the first imputation, which does not include any of the regex title flags (master, miss, mr, mrs) in the MI predictor matrix.

# First imputation: build a predictor matrix that leaves the title flags out.
# The maxit = 0 run is used only to obtain the default predictor matrix.
Matrix1 <- mice(all, maxit = 0)
## Warning: Number of logged events: 1
predM1 <- Matrix1$predictorMatrix
# Zero the columns of variables that should not act as predictors: identifiers
# and free text, the helper flags, and (for this run) the four title flags.
predM1[, c(
  "PassengerId", "Name", "Ticket", "Cabin",
  "miss.age", "sib.size",
  "master", "mr", "miss", "mrs"
)] <- 0


# Run the first multiple imputation (5 imputed data sets) with the restricted
# predictor matrix. A fixed (arbitrary) seed makes the imputed values, and the
# plots built from them, reproducible between runs.
imp1 <- mice(all, predictorMatrix = predM1, m = 5, seed = 488)
## 
##  iter imp variable
##   1   1  Age  Fare
##   1   2  Age  Fare
##   1   3  Age  Fare
##   1   4  Age  Fare
##   1   5  Age  Fare
##   2   1  Age  Fare
##   2   2  Age  Fare
##   2   3  Age  Fare
##   2   4  Age  Fare
##   2   5  Age  Fare
##   3   1  Age  Fare
##   3   2  Age  Fare
##   3   3  Age  Fare
##   3   4  Age  Fare
##   3   5  Age  Fare
##   4   1  Age  Fare
##   4   2  Age  Fare
##   4   3  Age  Fare
##   4   4  Age  Fare
##   4   5  Age  Fare
##   5   1  Age  Fare
##   5   2  Age  Fare
##   5   3  Age  Fare
##   5   4  Age  Fare
##   5   5  Age  Fare
# Extract the first of the imputed data sets as a complete data frame.
imputed.v1 <- mice::complete(imp1, action = 1)

Perform the second imputation, which includes the master, miss, mr, and mrs flags in the MI predictor matrix.

# Second imputation: the title flags stay in the predictor matrix this time.
# The maxit = 0 run is used only to obtain the default predictor matrix.
Matrix2 <- mice(all, maxit = 0)
## Warning: Number of logged events: 1
predM2 <- Matrix2$predictorMatrix
# Zero only the identifier/free-text columns and the helper flags; the
# master/mr/miss/mrs columns are left untouched so they remain predictors.
predM2[, c(
  "PassengerId", "Name", "Ticket", "Cabin",
  "miss.age", "sib.size"
)] <- 0

# Run the second multiple imputation (5 imputed data sets) with the predictor
# matrix that keeps the title flags. A fixed (arbitrary) seed makes the imputed
# values, and the plots built from them, reproducible between runs.
imp2 <- mice(all, predictorMatrix = predM2, m = 5, seed = 488)
## 
##  iter imp variable
##   1   1  Age  Fare
##   1   2  Age  Fare
##   1   3  Age  Fare
##   1   4  Age  Fare
##   1   5  Age  Fare
##   2   1  Age  Fare
##   2   2  Age  Fare
##   2   3  Age  Fare
##   2   4  Age  Fare
##   2   5  Age  Fare
##   3   1  Age  Fare
##   3   2  Age  Fare
##   3   3  Age  Fare
##   3   4  Age  Fare
##   3   5  Age  Fare
##   4   1  Age  Fare
##   4   2  Age  Fare
##   4   3  Age  Fare
##   4   4  Age  Fare
##   4   5  Age  Fare
##   5   1  Age  Fare
##   5   2  Age  Fare
##   5   3  Age  Fare
##   5   4  Age  Fare
##   5   5  Age  Fare
# Extract the first of the imputed data sets as a complete data frame.
imputed.v2 <- mice::complete(imp2, action = 1)

Graphs to visualize how the imputed values differ when the title flags are excluded from, versus included in, the MI predictor matrix.

Master Plots

# MASTER, imputation 1: ages of passengers flagged "Master".
# Observed ages are drawn in black, imputed ages (miss.age > 0) in pink.
master.imp1 <- imputed.v1[which(imputed.v1$master == "TRUE"), ]
master.imp1$imp.flag <- replace(
  rep("black", nrow(master.imp1)),
  which(master.imp1$miss.age > 0),
  "deeppink2"
)

# Wider right margin and xpd = TRUE so the legend can sit outside the plot area.
par(xpd = TRUE, mar = c(5.1, 4.1, 4.1, 8.1))
plot(
  master.imp1$Age,
  pch = 8,
  col = master.imp1$imp.flag,
  main = "Passengers with 'Master' Title - Imputation #1",
  ylab = "Original and Imputed Ages"
)
legend(
  "topright",
  inset = c(-0.35, 0),
  legend = c("True Age", "Imputed Age"),
  col = c("black", "deeppink2"),
  pch = 8,
  cex = 0.9
)

# MASTER, imputation 2: ages of passengers flagged "Master".
# Observed ages are drawn in black, imputed ages (miss.age > 0) in pink.
master.imp2 <- imputed.v2[which(imputed.v2$master == "TRUE"), ]
master.imp2$imp.flag <- replace(
  rep("black", nrow(master.imp2)),
  which(master.imp2$miss.age > 0),
  "deeppink2"
)

# Wider right margin and xpd = TRUE so the legend can sit outside the plot area.
par(xpd = TRUE, mar = c(5.1, 4.1, 4.1, 8.1))
plot(
  master.imp2$Age,
  pch = 8,
  col = master.imp2$imp.flag,
  main = "Passengers with 'Master' Title - Imputation #2",
  ylab = "Original and Imputed Ages"
)
legend(
  "topright",
  inset = c(-0.35, 0),
  legend = c("True Age", "Imputed Age"),
  col = c("black", "deeppink2"),
  pch = 8,
  cex = 0.9
)


Mr Plots

# MR, imputation 1: ages of passengers whose `mr` flag is set.
# Observed ages are drawn in black, imputed ages (miss.age > 0) in purple.
mr.imp1 <- imputed.v1[which(imputed.v1$mr == "TRUE"), ]
mr.imp1$imp.flag <- replace(
  rep("black", nrow(mr.imp1)),
  which(mr.imp1$miss.age > 0),
  "purple"
)

# Wider right margin and xpd = TRUE so the legend can sit outside the plot area.
par(xpd = TRUE, mar = c(5.1, 4.1, 4.1, 8.1))
plot(
  mr.imp1$Age,
  pch = 8,
  col = mr.imp1$imp.flag,
  main = "Passengers with 'Mr' Title - Imputation #1",
  ylab = "Original and Imputed Ages"
)
legend(
  "topright",
  inset = c(-0.35, 0),
  legend = c("True Age", "Imputed Age"),
  col = c("black", "purple"),
  pch = 8,
  cex = 0.9
)

# MR, imputation 2: ages of passengers whose `mr` flag is set.
# Observed ages are drawn in black, imputed ages (miss.age > 0) in purple.
mr.imp2 <- imputed.v2[which(imputed.v2$mr == "TRUE"), ]
mr.imp2$imp.flag <- replace(
  rep("black", nrow(mr.imp2)),
  which(mr.imp2$miss.age > 0),
  "purple"
)

# Wider right margin and xpd = TRUE so the legend can sit outside the plot area.
par(xpd = TRUE, mar = c(5.1, 4.1, 4.1, 8.1))
plot(
  mr.imp2$Age,
  pch = 8,
  col = mr.imp2$imp.flag,
  main = "Passengers with 'Mr' Title - Imputation #2",
  ylab = "Original and Imputed Ages"
)
legend(
  "topright",
  inset = c(-0.35, 0),
  legend = c("True Age", "Imputed Age"),
  col = c("black", "purple"),
  pch = 8,
  cex = 0.9
)


Mrs Plots

# MRS, imputation 1: ages of passengers flagged "Mrs.".
# Observed ages are drawn in black, imputed ages (miss.age > 0) in green.
mrs.imp1 <- imputed.v1[which(imputed.v1$mrs == "TRUE"), ]
mrs.imp1$imp.flag <- replace(
  rep("black", nrow(mrs.imp1)),
  which(mrs.imp1$miss.age > 0),
  "green3"
)

# Wider right margin and xpd = TRUE so the legend can sit outside the plot area.
par(xpd = TRUE, mar = c(5.1, 4.1, 4.1, 8.1))
plot(
  mrs.imp1$Age,
  pch = 8,
  col = mrs.imp1$imp.flag,
  main = "Passengers with 'Mrs' Title - Imputation #1",
  ylab = "Original and Imputed Ages"
)
legend(
  "topright",
  inset = c(-0.35, 0),
  legend = c("True Age", "Imputed Age"),
  col = c("black", "green3"),
  pch = 8,
  cex = 0.9
)

# Visualization for MRS, imputation 2: ages of passengers flagged "Mrs.".
# Observed ages are drawn in black, imputed ages (miss.age > 0) in green.
mrs.imp2 <- subset(imputed.v2, mrs == "TRUE")
mrs.imp2$imp.flag <- "black"
# Index with this subset's own miss.age column, not the one from mrs.imp1,
# so the colours always line up with the rows being plotted.
mrs.imp2$imp.flag[mrs.imp2$miss.age > 0] <- "green3"

# Wider right margin and xpd = TRUE so the legend can sit outside the plot area.
par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE)
plot(mrs.imp2$Age, col = mrs.imp2$imp.flag, ylab = "Original and Imputed Ages",
     main = "Passengers with 'Mrs' Title - Imputation #2", pch = 8)
legend("topright", inset = c(-0.35, 0), legend = c("True Age", "Imputed Age"),
       pch = 8, col = c("black", "green3"), cex = 0.9)