library(mice); library(tree)
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Load the Titanic train and test CSVs and stack them into a single data frame
# (`all`) so the derived features and the imputation below cover both sets.
# A Survived column is added to `test` before stacking.
# (presumably test.csv has no Survived column and rbind() needs matching
# columns -- TODO confirm)
# NOTE(review): the value assigned is the character string "NA", not a missing
# value (NA). After rbind() the combined Survived column is presumably
# character rather than numeric -- confirm this is intended before changing it,
# since a real NA would make mice() treat Survived as a variable to impute.
# NOTE(review): `all` also names a base R function; calls such as all(x) still
# resolve to the function, but the shared name is easy to misread.
train <- read.csv(file = "/Users/sofia/Desktop/MS/SPRING 2019/488/Titanic/train.csv")
test <- read.csv(file = "/Users/sofia/Desktop/MS/SPRING 2019/488/Titanic/test.csv")
test[["Survived"]] <- "NA"
all <- rbind(train, test)
# Sibling size categories built from SibSp:
#   0 = SibSp of 0, 1 = SibSp of 1-2, 2 = SibSp of 3 or more.
# Each comparison contributes one step; a row whose SibSp is NA satisfies
# neither comparison and therefore stays in category 0.
all$sib.size <- with(
  all,
  as.numeric(!is.na(SibSp) & SibSp > 0) +
    as.numeric(!is.na(SibSp) & SibSp >= 3)
)
# Flag rows whose Age is missing (1 = missing, 0 = recorded) so that imputed
# ages can be told apart from recorded ones in the reasonability checks later.
# is.na() tests missingness directly; a comparison such as `Age > 0` would
# also flag a recorded age of 0 as missing.
all$miss.age <- as.numeric(is.na(all$Age))
# Logical title flags derived from the passenger name, used to sanity-check
# the imputed ages: passengers titled "Master" should get an imputed age
# under 18, while most "Mr."/"Mrs." passengers should get one over 18.
#
# fixed = TRUE makes grepl() match each title literally. Read as a regular
# expression, the "." in "Mr." matches any character, so "Mr." would also
# match every "Mrs." name and the `mr` flag would include married women.
chars <- all$Name
value <- "Master"; mr <- "Mr."; mrs <- "Mrs."; miss <- "Miss."
master.flag <- grepl(value, chars, fixed = TRUE)
mr.flag <- grepl(mr, chars, fixed = TRUE)
mrs.flag <- grepl(mrs, chars, fixed = TRUE)
miss.flag <- grepl(miss, chars, fixed = TRUE)
# Attach the flags to the combined data so they travel through imputation and
# can be used to subset the completed data afterwards.
all$master <- master.flag
all$mr <- mr.flag
all$mrs <- mrs.flag
all$miss <- miss.flag
# First imputation: the title flags are excluded from the predictor matrix, so
# missing values are imputed without any information from the passenger title.
# The zero-iteration run (maxit = 0) performs no imputation; it is used only to
# obtain mice's default predictor matrix as a starting point.
Matrix1 <- mice(all, maxit = 0)
## Warning: Number of logged events: 1
predM1 <- Matrix1$predictorMatrix
# Columns of the predictor matrix are the predictors; zeroing a column stops
# that variable from being used to impute any other variable. Identifiers,
# free-text fields, the derived helper columns and all four title flags are
# switched off here.
predM1[, c(
  "PassengerId", "Name", "Ticket", "Cabin", "miss.age", "sib.size",
  "master", "mr", "miss", "mrs"
)] <- 0
# A fixed seed makes the imputed values (and the plots built from them)
# reproducible from run to run.
imp1 <- mice(all, predictorMatrix = predM1, m = 5, seed = 2019)
##
## iter imp variable
## 1 1 Age Fare
## 1 2 Age Fare
## 1 3 Age Fare
## 1 4 Age Fare
## 1 5 Age Fare
## 2 1 Age Fare
## 2 2 Age Fare
## 2 3 Age Fare
## 2 4 Age Fare
## 2 5 Age Fare
## 3 1 Age Fare
## 3 2 Age Fare
## 3 3 Age Fare
## 3 4 Age Fare
## 3 5 Age Fare
## 4 1 Age Fare
## 4 2 Age Fare
## 4 3 Age Fare
## 4 4 Age Fare
## 4 5 Age Fare
## 5 1 Age Fare
## 5 2 Age Fare
## 5 3 Age Fare
## 5 4 Age Fare
## 5 5 Age Fare
# Completed data for imputation run 1. action = 1L is the default of
# complete(), i.e. only the first of the m = 5 imputed data sets is extracted.
imputed.v1 <- complete(imp1, action = 1L)
# Second imputation: same setup as the first, except that the title flags
# (master, mr, mrs, miss) are left switched on in the predictor matrix so they
# can inform the imputed values.
# The zero-iteration run (maxit = 0) performs no imputation; it is used only to
# obtain mice's default predictor matrix as a starting point.
Matrix2 <- mice(all, maxit = 0)
## Warning: Number of logged events: 1
predM2 <- Matrix2$predictorMatrix
# Columns of the predictor matrix are the predictors; zeroing a column stops
# that variable from being used to impute any other variable. Only the
# identifiers, free-text fields and derived helper columns are switched off.
predM2[, c(
  "PassengerId", "Name", "Ticket", "Cabin", "miss.age", "sib.size"
)] <- 0
# A fixed seed makes the imputed values (and the plots built from them)
# reproducible from run to run.
imp2 <- mice(all, predictorMatrix = predM2, m = 5, seed = 2019)
##
## iter imp variable
## 1 1 Age Fare
## 1 2 Age Fare
## 1 3 Age Fare
## 1 4 Age Fare
## 1 5 Age Fare
## 2 1 Age Fare
## 2 2 Age Fare
## 2 3 Age Fare
## 2 4 Age Fare
## 2 5 Age Fare
## 3 1 Age Fare
## 3 2 Age Fare
## 3 3 Age Fare
## 3 4 Age Fare
## 3 5 Age Fare
## 4 1 Age Fare
## 4 2 Age Fare
## 4 3 Age Fare
## 4 4 Age Fare
## 4 5 Age Fare
## 5 1 Age Fare
## 5 2 Age Fare
## 5 3 Age Fare
## 5 4 Age Fare
## 5 5 Age Fare
# Completed data for imputation run 2. action = 1L is the default of
# complete(), i.e. only the first of the m = 5 imputed data sets is extracted.
imputed.v2 <- complete(imp2, action = 1L)
# Master Plots ----
# "Master" passengers, imputation 1: Age plotted against row position within
# the subset. Rows whose age was imputed (miss.age > 0) are drawn in
# deeppink2, rows with a recorded age in black.
master.imp1 <- subset(imputed.v1, master == "TRUE")
master.imp1$imp.flag <- ifelse(master.imp1$miss.age > 0, "deeppink2", "black")
# Widen the right margin and switch clipping off so the legend can be placed
# outside the plot region.
par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE)
plot(
  master.imp1$Age,
  col = master.imp1$imp.flag,
  pch = 8,
  ylab = "Original and Imputed Ages",
  main = "Passengers with 'Master' Title - Imputation #1"
)
legend(
  "topright",
  inset = c(-0.35, 0),
  legend = c("True Age", "Imputed Age"),
  col = c("black", "deeppink2"),
  pch = 8,
  cex = 0.9
)
# "Master" passengers, imputation 2: Age plotted against row position within
# the subset. Rows whose age was imputed (miss.age > 0) are drawn in
# deeppink2, rows with a recorded age in black.
master.imp2 <- subset(imputed.v2, master == "TRUE")
master.imp2$imp.flag <- ifelse(master.imp2$miss.age > 0, "deeppink2", "black")
# Widen the right margin and switch clipping off so the legend can be placed
# outside the plot region.
par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE)
plot(
  master.imp2$Age,
  col = master.imp2$imp.flag,
  pch = 8,
  ylab = "Original and Imputed Ages",
  main = "Passengers with 'Master' Title - Imputation #2"
)
legend(
  "topright",
  inset = c(-0.35, 0),
  legend = c("True Age", "Imputed Age"),
  col = c("black", "deeppink2"),
  pch = 8,
  cex = 0.9
)
# Mr Plots ----
# "Mr" passengers, imputation 1: Age plotted against row position within the
# subset. Rows whose age was imputed (miss.age > 0) are drawn in purple, rows
# with a recorded age in black.
mr.imp1 <- subset(imputed.v1, mr == "TRUE")
mr.imp1$imp.flag <- ifelse(mr.imp1$miss.age > 0, "purple", "black")
# Widen the right margin and switch clipping off so the legend can be placed
# outside the plot region.
par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE)
plot(
  mr.imp1$Age,
  col = mr.imp1$imp.flag,
  pch = 8,
  ylab = "Original and Imputed Ages",
  main = "Passengers with 'Mr' Title - Imputation #1"
)
legend(
  "topright",
  inset = c(-0.35, 0),
  legend = c("True Age", "Imputed Age"),
  col = c("black", "purple"),
  pch = 8,
  cex = 0.9
)
# "Mr" passengers, imputation 2: Age plotted against row position within the
# subset. Rows whose age was imputed (miss.age > 0) are drawn in purple, rows
# with a recorded age in black.
mr.imp2 <- subset(imputed.v2, mr == "TRUE")
mr.imp2$imp.flag <- ifelse(mr.imp2$miss.age > 0, "purple", "black")
# Widen the right margin and switch clipping off so the legend can be placed
# outside the plot region.
par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE)
plot(
  mr.imp2$Age,
  col = mr.imp2$imp.flag,
  pch = 8,
  ylab = "Original and Imputed Ages",
  main = "Passengers with 'Mr' Title - Imputation #2"
)
legend(
  "topright",
  inset = c(-0.35, 0),
  legend = c("True Age", "Imputed Age"),
  col = c("black", "purple"),
  pch = 8,
  cex = 0.9
)
# Mrs Plots ----
# "Mrs" passengers, imputation 1: Age plotted against row position within the
# subset. Rows whose age was imputed (miss.age > 0) are drawn in green3, rows
# with a recorded age in black.
mrs.imp1 <- subset(imputed.v1, mrs == "TRUE")
mrs.imp1$imp.flag <- ifelse(mrs.imp1$miss.age > 0, "green3", "black")
# Widen the right margin and switch clipping off so the legend can be placed
# outside the plot region.
par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE)
plot(
  mrs.imp1$Age,
  col = mrs.imp1$imp.flag,
  pch = 8,
  ylab = "Original and Imputed Ages",
  main = "Passengers with 'Mrs' Title - Imputation #1"
)
legend(
  "topright",
  inset = c(-0.35, 0),
  legend = c("True Age", "Imputed Age"),
  col = c("black", "green3"),
  pch = 8,
  cex = 0.9
)
# "Mrs" passengers, imputation 2: Age plotted against row position within the
# subset. Rows whose age was imputed (miss.age > 0) are drawn in green3, rows
# with a recorded age in black.
mrs.imp2 <- subset(imputed.v2, mrs == "TRUE")
# The colour mask is built from this subset's own miss.age column, so the
# block does not depend on the imputation-1 subset existing or having the
# same rows.
mrs.imp2$imp.flag <- "black"
mrs.imp2$imp.flag[mrs.imp2$miss.age > 0] <- "green3"
# Widen the right margin and switch clipping off so the legend can be placed
# outside the plot region.
par(mar = c(5.1, 4.1, 4.1, 8.1), xpd = TRUE)
plot(
  mrs.imp2$Age,
  col = mrs.imp2$imp.flag,
  pch = 8,
  ylab = "Original and Imputed Ages",
  main = "Passengers with 'Mrs' Title - Imputation #2"
)
legend(
  "topright",
  inset = c(-0.35, 0),
  legend = c("True Age", "Imputed Age"),
  col = c("black", "green3"),
  pch = 8,
  cex = 0.9
)