# Load the training and test sets. Empty strings and "NA" are both read as
# missing values, and the same rule is applied to both files so that blanks
# are treated consistently.
titanic <- read.csv("train.csv", header = TRUE, na.strings = c("", "NA"))
test <- read.csv("test.csv", header = TRUE, na.strings = c("", "NA"))

library(ISLR)
library(rpart)
library(tree)
library(rpart.plot)
library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)

Summary Statistics

# Per-column summary of the training data (ranges, quartiles, level counts, NA counts).
summary(object = titanic)
##   PassengerId       Survived          Pclass     
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :446.0   Median :0.0000   Median :3.000  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309  
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000  
##                                                  
##                                     Name         Sex           Age       
##  Abbing, Mr. Anthony                  :  1   female:314   Min.   : 0.42  
##  Abbott, Mr. Rossmore Edward          :  1   male  :577   1st Qu.:20.12  
##  Abbott, Mrs. Stanton (Rosa Hunt)     :  1                Median :28.00  
##  Abelson, Mr. Samuel                  :  1                Mean   :29.70  
##  Abelson, Mrs. Samuel (Hannah Wizosky):  1                3rd Qu.:38.00  
##  Adahl, Mr. Mauritz Nils Martin       :  1                Max.   :80.00  
##  (Other)                              :885                NA's   :177    
##      SibSp           Parch             Ticket         Fare       
##  Min.   :0.000   Min.   :0.0000   1601    :  7   Min.   :  0.00  
##  1st Qu.:0.000   1st Qu.:0.0000   347082  :  7   1st Qu.:  7.91  
##  Median :0.000   Median :0.0000   CA. 2343:  7   Median : 14.45  
##  Mean   :0.523   Mean   :0.3816   3101295 :  6   Mean   : 32.20  
##  3rd Qu.:1.000   3rd Qu.:0.0000   347088  :  6   3rd Qu.: 31.00  
##  Max.   :8.000   Max.   :6.0000   CA 2144 :  6   Max.   :512.33  
##                                   (Other) :852                   
##          Cabin     Embarked  
##  B96 B98    :  4   C   :168  
##  C23 C25 C27:  4   Q   : 77  
##  G6         :  4   S   :644  
##  C22 C26    :  3   NA's:  2  
##  D          :  3             
##  (Other)    :186             
##  NA's       :687
# Overall share of passengers who survived and who died in the training data.
Lived <- with(titanic, sum(Survived) / length(Survived))
Died <- 1 - Lived
rate <- cbind(Lived, Died)
as.data.frame(rate)
##       Lived      Died
## 1 0.3838384 0.6161616
# Jittered scatter of Age by passenger class, coloured by sex, with one panel
# per value of Survived. Points are jittered horizontally only.
posn.j <- position_jitter(width = 0.5, height = 0)
ggplot(titanic, aes(x = factor(Pclass), y = Age, colour = factor(Sex))) +
  geom_jitter(size = 3, alpha = 0.5, position = posn.j) +
  facet_grid(". ~ Survived")

Understanding which values are missing

From this plot, we see that Embarked, Age, and Cabin contain missing values. I will impute Age (about 20% missing), as it is the variable whose missingness is easiest to understand.

# Share of missing values in every column. The lambda formula is passed
# directly to summarize_all() because funs() is deprecated since dplyr 0.8.0;
# mean(is.na(.)) is the same quantity as sum(is.na(.)) / n().
missing_values <- titanic %>%
  summarize_all(~ mean(is.na(.)))

# Reshape to long format: one row per feature with its missing share.
missing_values <- gather(missing_values, key = "feature", value = "missing_pct")

# Horizontal bar chart of the missing share, with features sorted by that share.
missing_values %>%
  ggplot(aes(x = reorder(feature, -missing_pct), y = missing_pct)) +
  geom_bar(stat = "identity", fill = "light pink") +
  coord_flip() +
  theme_bw()

Imputing Age using other variables to predict

# Impute missing Age values with a linear model fitted on the passengers whose
# age is known, then write the predictions back into `titanic`.
# NOTE(review): Survived is used as a predictor here, so the response leaks
# into a feature and the same model cannot be applied to data without
# Survived -- consider dropping it.
missing_val <- is.na(titanic$Age)
age_train <- titanic[!missing_val, ]  # rows with a known age: fitting data
age_test <- titanic[missing_val, ]    # rows with a missing age: to be filled

lm_Age <- lm(Age ~ Pclass + Survived + SibSp, data = age_train)
age_test$Age <- predict(lm_Age, newdata = age_test)

# Match on PassengerId so each prediction lands on the passenger it was made for.
titanic$Age[titanic$PassengerId %in% age_test$PassengerId] <- age_test$Age

Tree Model

# Split by row position: the first 450 rows are used for fitting and rows
# 451-891 are held out. Fit a default rpart tree on the fitting rows and draw it.
train <- titanic[seq_len(450), ]
test <- titanic[seq(451, 891), ]
tree <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Fare,
  data = train
)
rpart.plot(tree)

Finding Optimal CP Value for Tree

# 5-fold cross-validation over a grid of rpart complexity parameters (cp).
# For every candidate cp, a tree is fitted on four folds and scored on the
# remaining one; the squared errors are pooled over the folds and divided by
# the number of rows, giving one cross-validated MSE per cp value in `cpList`.
set.seed(1234)
n_folds <- 5
# Randomly assign each row to one of the folds.
titanic$fold <- sample(seq_len(n_folds), nrow(titanic), replace = TRUE)

# Candidate cp values: 20 evenly spaced points between 0.01 and 0.02.
cp <- seq(0.01, 0.02, length.out = 20)
cpList <- numeric(length(cp))
for (j in seq_along(cp)) {
  print(j)  # progress indicator
  sse <- numeric(n_folds)
  for (i in seq_len(n_folds)) {
    in_fold <- titanic$fold == i
    tree_K <- rpart(
      Survived ~ Pclass + Sex + Age + SibSp + Fare,
      data = titanic[!in_fold, ],
      control = rpart.control(cp = cp[j], minsplit = 10, minbucket = 5)
    )

    # Predicted values for the held-out fold
    yhat <- predict(tree_K, titanic[in_fold, ])
    # Actual values for the held-out fold
    y <- titanic$Survived[in_fold]

    sse[i] <- sum((y - yhat)^2)
  }

  # Pooled squared error over all folds, per observation.
  cpList[j] <- sum(sse) / nrow(titanic)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
# Cross-validated error for each candidate cp value.
plot(x = cp, y = unlist(cpList), pch = 16)

Tree Model with Optimal CP

# Fit the tree on the full training data with cp = 0.012 and the same
# minsplit/minbucket settings as in the cross-validation loop, then draw it
# and print its complexity table.
tree1 <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Fare,
  data = titanic,
  control = rpart.control(cp = 0.012, minsplit = 10, minbucket = 5)
)
rpart.plot(tree1)

printcp(tree1)
## 
## Regression tree:
## rpart(formula = Survived ~ Pclass + Sex + Age + SibSp + Fare, 
##     data = titanic, control = rpart.control(cp = 0.012, minsplit = 10, 
##         minbucket = 5))
## 
## Variables actually used in tree construction:
## [1] Age    Fare   Pclass Sex    SibSp 
## 
## Root node error: 210.73/891 = 0.23651
## 
## n= 891 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.295231      0   1.00000 1.00243 0.016060
## 2 0.073942      1   0.70477 0.70829 0.033261
## 3 0.034029      2   0.63083 0.63487 0.031750
## 4 0.023849      4   0.56277 0.57909 0.031589
## 5 0.022270      5   0.53892 0.56424 0.032782
## 6 0.013361      6   0.51665 0.53470 0.033589
## 7 0.012000      8   0.48993 0.53803 0.033937

MSE For Tree

# Hold-out MSE for the tuned tree. `y` and `yhat` must not be reused here:
# they are leftovers from the last cross-validation iteration (fold 5,
# cp = 0.02) and do not describe the tuned tree. Instead, a tree with the tuned
# settings is fitted on `train` (rows 1-450) and scored on `test`
# (rows 451-891).
tree_holdout <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Fare,
  data = train,
  control = rpart.control(cp = 0.012, minsplit = 10, minbucket = 5)
)
yhat_tree <- predict(tree_holdout, newdata = test)
mean((test$Survived - yhat_tree)^2)
## (output not regenerated; the previously shown value 0.1291507 was computed
##  from the leftover y/yhat of the final cross-validation iteration)

Random Forest

# Random forest on the same predictors as the tree. The first 450 rows are
# used for fitting and rows 451-891 are held out for evaluation.
library(randomForest)

train <- titanic[seq_len(450), ]
test <- titanic[seq(451, 891), ]

titanic_RF <- randomForest(
  Survived ~ Pclass + Sex + Age + SibSp + Fare,
  data = train,
  importance = TRUE
)
# Error as a function of the number of trees.
plot(titanic_RF)

# Variable importance measures (available because importance = TRUE).
varImpPlot(titanic_RF)

Random Forest MSE

# Hold-out MSE of the random forest on rows 451-891.
# The former `OOB=T` argument was removed: predict() for randomForest objects
# has no such parameter, so it was silently absorbed by `...` and had no effect.
predicted_Survived <- predict(titanic_RF, newdata = test)
mean((predicted_Survived - test$Survived)^2)
## [1] 0.1259538

Creating File

# Build the submission file from the random forest predictions on test.csv.
test <- read.csv("test.csv")

# Rows with missing predictors cannot be scored, so fill any gaps first.
# Age is predicted from Pclass and SibSp (Survived is not available in the
# test file); Fare falls back to the training median.
age_missing <- is.na(test$Age)
if (any(age_missing)) {
  lm_age_test <- lm(Age ~ Pclass + SibSp, data = titanic)
  test$Age[age_missing] <- predict(lm_age_test, newdata = test[age_missing, ])
}
fare_missing <- is.na(test$Fare)
if (any(fare_missing)) {
  test$Fare[fare_missing] <- median(titanic$Fare, na.rm = TRUE)
}

# titanic_RF was fitted on a numeric 0/1 response, so predict() returns a
# continuous score; threshold it at 0.5 to obtain the 0/1 label.
survival_score <- predict(titanic_RF, newdata = test)
test$Survived <- as.integer(survival_score > 0.5)

# The id column header is spelled "PassengerId", matching the column name in the data.
kajalfile <- cbind(test$PassengerId, test$Survived)
colnames(kajalfile) <- c("PassengerId", "Survived")
write.csv(kajalfile, file = "KajalChokshiTitanic", fileEncoding = "macroman", row.names = FALSE)

Kaggle Score