# Load the Kaggle Titanic training data. Empty strings and the literal "NA"
# are both treated as missing values.
# stringsAsFactors = TRUE keeps text columns (Sex, Embarked, ...) as factors
# on R >= 4.0, where the read.csv() default changed to FALSE; the factor-level
# counts printed by summary() and the randomForest fit depend on this.
titanic <- read.csv(
  "train.csv",
  header = TRUE,
  na.strings = c("", "NA"),
  stringsAsFactors = TRUE
)
# Load the Kaggle test data (it is read again before the submission is built).
test <- read.csv("test.csv", stringsAsFactors = TRUE)

library(ISLR)
library(rpart)
library(tree)
library(rpart.plot)
library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)

Summary Statistics

# Per-column overview of the training data: quartiles for numeric columns,
# level counts for factor columns, and the number of NA's where present.
summary(titanic)

##   PassengerId       Survived          Pclass     
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :446.0   Median :0.0000   Median :3.000  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309  
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000  
##                                                  
##                                     Name         Sex           Age       
##  Abbing, Mr. Anthony                  :  1   female:314   Min.   : 0.42  
##  Abbott, Mr. Rossmore Edward          :  1   male  :577   1st Qu.:20.12  
##  Abbott, Mrs. Stanton (Rosa Hunt)     :  1                Median :28.00  
##  Abelson, Mr. Samuel                  :  1                Mean   :29.70  
##  Abelson, Mrs. Samuel (Hannah Wizosky):  1                3rd Qu.:38.00  
##  Adahl, Mr. Mauritz Nils Martin       :  1                Max.   :80.00  
##  (Other)                              :885                NA's   :177    
##      SibSp           Parch             Ticket         Fare       
##  Min.   :0.000   Min.   :0.0000   1601    :  7   Min.   :  0.00  
##  1st Qu.:0.000   1st Qu.:0.0000   347082  :  7   1st Qu.:  7.91  
##  Median :0.000   Median :0.0000   CA. 2343:  7   Median : 14.45  
##  Mean   :0.523   Mean   :0.3816   3101295 :  6   Mean   : 32.20  
##  3rd Qu.:1.000   3rd Qu.:0.0000   347088  :  6   3rd Qu.: 31.00  
##  Max.   :8.000   Max.   :6.0000   CA 2144 :  6   Max.   :512.33  
##                                   (Other) :852                   
##          Cabin     Embarked  
##  B96 B98    :  4   C   :168  
##  C23 C25 C27:  4   Q   : 77  
##  G6         :  4   S   :644  
##  C22 C26    :  3   NA's:  2  
##  D          :  3             
##  (Other)    :186             
##  NA's       :687

# Overall survival rate in the training data and its complement.
Lived <- sum(titanic$Survived) / nrow(titanic)
Died <- 1 - Lived
# One-row table with both proportions, shown as a data frame.
rate <- cbind(Lived = Lived, Died = Died)
as.data.frame(rate)

##       Lived      Died
## 1 0.3838384 0.6161616

# Jitter points horizontally only (width 0.5, height 0) so overlapping
# passengers become visible without shifting their plotted ages.
posn.j <- position_jitter(width = 0.5, height = 0)
# Age by passenger class, coloured by sex, one panel per value of Survived.
ggplot(
  data = titanic,
  mapping = aes(x = factor(Pclass), y = Age, colour = factor(Sex))
) +
  geom_jitter(position = posn.j, size = 3, alpha = 0.5) +
  facet_grid(. ~ Survived)

Understanding which values are missing

From the plot below, we see that Embarked, Age, and Cabin have missing values. I will impute Age (about 20% missing), as it is the variable whose missing values can most reasonably be predicted from the other variables.

# Share of missing values in every column, returned as a one-row data frame.
# funs() is deprecated since dplyr 0.8.0; a formula lambda replaces it and
# leaves the column names unchanged.
missing_values <- titanic %>%
  summarize_all(~ sum(is.na(.)) / n())

## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.

# Reshape the one-row table of missing shares to long form:
# one row per feature, with its share in `missing_pct`.
missing_values <- missing_values %>%
  gather(key = "feature", value = "missing_pct")

# Horizontal bar chart of the missing share per feature, ordered by share.
ggplot(
  missing_values,
  aes(x = reorder(feature, -missing_pct), y = missing_pct)
) +
  geom_col(fill = "light pink") +
  coord_flip() +
  theme_bw()

Imputing Age using other variables to predict

# Split passengers by whether Age is recorded.
missing_val <- is.na(titanic$Age)
age_test <- titanic[missing_val, ]
age_train <- titanic[!missing_val, ]

# Linear model for Age, fitted on the passengers whose age is known.
lm_Age <- lm(Age ~ Pclass + Survived + SibSp, data = age_train)
# Predict the unknown ages and write them back into the main data frame;
# row order is preserved, so the logical mask lines up with the predictions.
age_test$Age <- predict(lm_Age, newdata = age_test)
titanic$Age[missing_val] <- age_test$Age

Tree Model

# Positional hold-out split: rows 1-450 for training, rows 451-891 for testing.
train <- titanic[seq_len(450), ]
test <- titanic[seq(451, 891), ]
# Tree for the numeric 0/1 Survived outcome, using rpart's default controls.
tree <- rpart(
  formula = Survived ~ Pclass + Sex + Age + SibSp + Fare,
  data = train
)
rpart.plot(tree)

Finding Optimal CP Value for Tree

# 5-fold cross-validation to choose the rpart complexity parameter (cp).
set.seed(1234)
n_folds <- 5L
# Randomly assign every passenger to one of the folds.
titanic$fold <- sample(seq_len(n_folds), nrow(titanic), replace = TRUE)

# Candidate cp values: 20 evenly spaced points between 0.01 and 0.02.
cp <- seq(0.01, 0.02, length.out = 20)
# cpList[[j]] will hold the cross-validated MSE for cp[j].
cpList <- vector("list", length(cp))
for (j in seq_along(cp)) {
  print(j) # progress indicator
  # sse[[i]] holds the squared-error sum on held-out fold i.
  sse <- vector("list", n_folds)
  for (i in seq_len(n_folds)) {
    held_out <- titanic$fold == i
    # Fit on every fold except i.
    tree_K <- rpart(
      Survived ~ Pclass + Sex + Age + SibSp + Fare,
      data = titanic[!held_out, ],
      control = rpart.control(cp = cp[j], minsplit = 10, minbucket = 5)
    )

    # Predicted values for the held-out fold
    yhat <- predict(tree_K, titanic[held_out, ])
    # Actual values for the held-out fold
    y <- titanic$Survived[held_out]

    sse[[i]] <- sum((y - yhat)^2)
  }

  # Total squared error over all folds divided by n: cross-validated MSE.
  cpList[[j]] <- sum(unlist(sse)) / nrow(titanic)
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20

# Cross-validated MSE against each candidate cp value.
plot(x = cp, y = unlist(cpList), pch = 16)

Tree Model with Optimal CP

# Refit the tree on all training rows with the chosen cp of 0.012
# (presumably read off the cross-validation plot) and the same
# minsplit/minbucket settings used during cross-validation.
tree1 <- rpart(
  formula = Survived ~ Pclass + Sex + Age + SibSp + Fare,
  data = titanic,
  control = rpart.control(cp = 0.012, minsplit = 10, minbucket = 5)
)
rpart.plot(tree1)

# Complexity table: cp, number of splits, relative and cross-validated error.
printcp(tree1)

## 
## Regression tree:
## rpart(formula = Survived ~ Pclass + Sex + Age + SibSp + Fare, 
##     data = titanic, control = rpart.control(cp = 0.012, minsplit = 10, 
##         minbucket = 5))
## 
## Variables actually used in tree construction:
## [1] Age    Fare   Pclass Sex    SibSp 
## 
## Root node error: 210.73/891 = 0.23651
## 
## n= 891 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.295231      0   1.00000 1.00243 0.016060
## 2 0.073942      1   0.70477 0.70829 0.033261
## 3 0.034029      2   0.63083 0.63487 0.031750
## 4 0.023849      4   0.56277 0.57909 0.031589
## 5 0.022270      5   0.53892 0.56424 0.032782
## 6 0.013361      6   0.51665 0.53470 0.033589
## 7 0.012000      8   0.48993 0.53803 0.033937

MSE For Tree

# Hold-out MSE of the tuned tree.
# The previous expression, mean((y-yhat)^2), reused `y` and `yhat` left over
# from the final iteration of the cross-validation loop (fold 5, cp = 0.02),
# so it did not measure the tuned tree at all.
# Here the tree is fitted with the tuned settings on `train` and scored on
# `test`, the same positional split used for the random forest, so the two
# MSE figures are comparable.
# NOTE: the value printed below was produced by the earlier expression.
tree_holdout <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Fare,
  data = train,
  control = rpart.control(cp = 0.012, minsplit = 10, minbucket = 5)
)
mean((test$Survived - predict(tree_holdout, newdata = test))^2)

## [1] 0.1291507

Random Forest

# Positional hold-out split: rows 1-450 for training, rows 451-891 for testing.
train <- titanic[seq_len(450), ]
test <- titanic[seq(451, 891), ]

library(randomForest)
# Random forest on the numeric 0/1 Survived outcome; importance = TRUE stores
# the variable-importance measures needed by varImpPlot().
titanic_RF <- randomForest(
  Survived ~ Pclass + Sex + Age + SibSp + Fare,
  data = train,
  importance = TRUE
)
# Error as a function of the number of trees.
plot(titanic_RF)

# Variable importance for each predictor.
varImpPlot(titanic_RF)

Random Forest MSE

# Predict survival for the hold-out passengers and report the test-set MSE.
# `OOB` is not an argument of predict.randomForest(); the original `OOB=T`
# was silently absorbed by `...` and had no effect, so it is dropped.
predicted_Survived <- predict(titanic_RF, newdata = test)
mean((predicted_Survived - test$Survived)^2)

## [1] 0.1259538

Creating File

# Build the Kaggle submission file from the random forest.
# stringsAsFactors = TRUE reads Sex as a factor on R >= 4.0 as well.
test <- read.csv("test.csv", stringsAsFactors = TRUE)

# predict() returns NA for any row with a missing predictor, which would put
# NA into the submission. The age model above uses Survived, which the test
# data does not have, so missing Age/Fare values are filled with the
# training medians instead.
test$Age[is.na(test$Age)] <- median(titanic$Age, na.rm = TRUE)
test$Fare[is.na(test$Fare)] <- median(titanic$Fare, na.rm = TRUE)

# The forest was fitted as a regression on the 0/1 outcome, so its predictions
# are scores between 0 and 1; threshold at 0.5 to obtain a 0/1 label.
survival_score <- predict(titanic_RF, newdata = test)
test$Survived <- as.integer(survival_score > 0.5)
# Fail loudly rather than write an incomplete submission.
stopifnot(!anyNA(test$Survived))

# Header uses "PassengerId", matching the column name in the data.
kajalfile <- data.frame(
  PassengerId = test$PassengerId,
  Survived = test$Survived
)
# NOTE(review): the output file name has no .csv extension - confirm intended.
write.csv(
  kajalfile,
  file = "KajalChokshiTitanic",
  fileEncoding = "macroman",
  row.names = FALSE
)

STAT 488 Titanic

Kajal Chokshi

5/3/2019

Summary Statistics

Understanding which values are missing

Imputing Age using other variables to predict

Tree Model

Finding Optimal CP Value for Tree

Tree Model with Optimal CP

MSE For Tree

Random Forest

Random Forest MSE

Creating File

Kaggle Score