# Load the Titanic passenger data.
# Empty strings are read as NA so that blank fields count as missing values.
# stringsAsFactors is set explicitly: the rest of the script (factor summaries,
# tree and forest fits on Sex) relies on text columns being factors, which
# stopped being the read.csv() default in R 4.0.0.
titanic <- read.csv(
  "train.csv",
  header = TRUE,
  na.strings = c("", "NA"),
  stringsAsFactors = TRUE
)
# Hold-out file; note that `test` is reassigned later in the script.
test <- read.csv("test.csv", stringsAsFactors = TRUE)
library(ISLR)
library(rpart)
library(tree)
library(rpart.plot)
library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)
# Per-column summary of the training data (ranges, level counts, NA counts).
summary(titanic)
## PassengerId Survived Pclass
## Min. : 1.0 Min. :0.0000 Min. :1.000
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000
## Median :446.0 Median :0.0000 Median :3.000
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Name Sex Age
## Abbing, Mr. Anthony : 1 female:314 Min. : 0.42
## Abbott, Mr. Rossmore Edward : 1 male :577 1st Qu.:20.12
## Abbott, Mrs. Stanton (Rosa Hunt) : 1 Median :28.00
## Abelson, Mr. Samuel : 1 Mean :29.70
## Abelson, Mrs. Samuel (Hannah Wizosky): 1 3rd Qu.:38.00
## Adahl, Mr. Mauritz Nils Martin : 1 Max. :80.00
## (Other) :885 NA's :177
## SibSp Parch Ticket Fare
## Min. :0.000 Min. :0.0000 1601 : 7 Min. : 0.00
## 1st Qu.:0.000 1st Qu.:0.0000 347082 : 7 1st Qu.: 7.91
## Median :0.000 Median :0.0000 CA. 2343: 7 Median : 14.45
## Mean :0.523 Mean :0.3816 3101295 : 6 Mean : 32.20
## 3rd Qu.:1.000 3rd Qu.:0.0000 347088 : 6 3rd Qu.: 31.00
## Max. :8.000 Max. :6.0000 CA 2144 : 6 Max. :512.33
## (Other) :852
## Cabin Embarked
## B96 B98 : 4 C :168
## C23 C25 C27: 4 Q : 77
## G6 : 4 S :644
## C22 C26 : 3 NA's: 2
## D : 3
## (Other) :186
## NA's :687
# Overall survival and death proportions across all passengers.
n_survivors <- sum(titanic$Survived)
Lived <- n_survivors / nrow(titanic)
Died <- 1 - Lived
# Keep the two proportions side by side in a one-row matrix, then show it as
# a data frame.
rate <- cbind(Lived, Died)
as.data.frame(rate)
## Lived Died
## 1 0.3838384 0.6161616
# Age by passenger class, coloured by sex, one panel per survival outcome.
# Points are jittered horizontally only (width 0.5, height 0) so ages stay exact.
posn.j <- position_jitter(width = 0.5, height = 0)
age_by_class <- ggplot(
  titanic,
  aes(x = factor(Pclass), y = Age, col = factor(Sex))
)
age_by_class +
  geom_jitter(size = 3, alpha = 0.5, position = posn.j) +
  facet_grid(". ~ Survived")
From the missing-value plot produced below, we see that Embarked, Age, and Cabin have missing values. I will impute Age (about 20% missing), as it is the variable whose missingness is most understandable.
# Fraction of missing values in every column (one row, one column per feature).
# A lambda formula replaces the deprecated funs() helper, so dplyr no longer
# emits the "funs() is soft deprecated" warning here.
missing_values <- titanic %>%
  summarize_all(~ sum(is.na(.)) / n())
# Reshape to long form (one row per feature) and draw a horizontal bar chart,
# ordered by the share of missing values.
missing_values <- missing_values %>%
  gather(key = "feature", value = "missing_pct")
ggplot(
  missing_values,
  aes(x = reorder(feature, -missing_pct), y = missing_pct)
) +
  geom_bar(stat = "identity", fill = "light pink") +
  coord_flip() +
  theme_bw()
# Impute missing ages with a linear regression fitted on the passengers whose
# age is recorded.
missing_val <- is.na(titanic$Age)
age_train <- titanic[which(!missing_val), ]
age_test <- titanic[which(missing_val), ]

# Age is modelled from passenger class, survival and number of siblings/spouses.
lm_Age <- lm(formula = Age ~ Pclass + Survived + SibSp, data = age_train)
age_test[["Age"]] <- predict(lm_Age, newdata = age_test)

# Write the predicted ages back to the matching passengers in the full table.
rows_to_fill <- titanic$PassengerId %in% age_test$PassengerId
titanic[rows_to_fill, "Age"] <- age_test$Age
# Split by row position: the first 450 passengers train the model, the
# remaining rows (451 to 891) are held out.
train <- titanic[seq(from = 1, to = 450), ]
test <- titanic[seq(from = 451, to = 891), ]

# Fit a tree with rpart's default settings on the training rows and draw it.
tree <- rpart(
  formula = Survived ~ Pclass + Sex + Age + SibSp + Fare,
  data = train
)
rpart.plot(tree)
# Choose rpart's complexity parameter (cp) by 5-fold cross-validation.
# For each candidate cp, a tree is fitted on four folds and scored on the
# fifth; cpList[[j]] holds the cross-validated mean squared error for cp[j].
set.seed(1234)
n_folds <- 5
# Random fold label per passenger; sized from the data instead of a literal.
titanic$fold <- sample(seq_len(n_folds), nrow(titanic), replace = TRUE)

# Candidate cp values (the earlier scalar assignment was dead code).
cp <- seq(0.01, 0.02, length.out = 20)
cpList <- vector("list", length(cp))

for (j in seq_along(cp)) {
  print(j) # progress indicator
  sse <- numeric(n_folds)
  for (i in seq_len(n_folds)) {
    held_out <- titanic$fold == i
    tree_K <- rpart(
      Survived ~ Pclass + Sex + Age + SibSp + Fare,
      data = titanic[!held_out, ],
      control = rpart.control(cp = cp[j], minsplit = 10, minbucket = 5)
    )
    # Predicted values on the held-out fold
    yhat <- predict(tree_K, titanic[held_out, ])
    # Actual values on the held-out fold
    y <- titanic$Survived[held_out]
    sse[i] <- sum((y - yhat)^2)
  }
  # Every passenger is held out exactly once, so dividing the pooled squared
  # error by the number of rows gives the cross-validated MSE.
  cpList[[j]] <- sum(sse) / nrow(titanic)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
# Cross-validated error against the candidate cp values, with explicit axis
# labels instead of the deparsed expressions.
plot(
  cp, unlist(cpList),
  pch = 16,
  xlab = "cp",
  ylab = "Cross-validated mean squared error"
)
# Refit on all passengers with a fixed cp of 0.012 and the same split limits
# used during cross-validation, then plot the tree and print its cp table.
tree1 <- rpart(
  formula = Survived ~ Pclass + Sex + Age + SibSp + Fare,
  data = titanic,
  control = rpart.control(cp = 0.012, minsplit = 10, minbucket = 5)
)
rpart.plot(tree1)
printcp(tree1)
##
## Regression tree:
## rpart(formula = Survived ~ Pclass + Sex + Age + SibSp + Fare,
## data = titanic, control = rpart.control(cp = 0.012, minsplit = 10,
## minbucket = 5))
##
## Variables actually used in tree construction:
## [1] Age Fare Pclass Sex SibSp
##
## Root node error: 210.73/891 = 0.23651
##
## n= 891
##
## CP nsplit rel error xerror xstd
## 1 0.295231 0 1.00000 1.00243 0.016060
## 2 0.073942 1 0.70477 0.70829 0.033261
## 3 0.034029 2 0.63083 0.63487 0.031750
## 4 0.023849 4 0.56277 0.57909 0.031589
## 5 0.022270 5 0.53892 0.56424 0.032782
## 6 0.013361 6 0.51665 0.53470 0.033589
## 7 0.012000 8 0.48993 0.53803 0.033937
# In-sample mean squared error of tree1.
# The earlier form reused `y` and `yhat`, which at this point still hold the
# last cross-validation fold scored with the last candidate cp, so the value
# it printed (0.1291507) did not describe tree1. Predict from tree1 explicitly.
tree1_pred <- predict(tree1, newdata = titanic)
mean((titanic$Survived - tree1_pred)^2)
# NOTE(review): expect roughly 0.116 here (final rel error x root node error
# from the printcp table above) - confirm on the next run.
# Random forest on the same positional split: the first 450 rows train the
# model, the remaining rows are held out for evaluation.
train <- titanic[1:450, ]
test <- titanic[451:891, ]
library(randomForest)

# Survived is numeric, so this fits a regression forest. The forest is
# randomised; fixing the seed makes the fit and the error below reproducible.
set.seed(1234)
titanic_RF <- randomForest(
  Survived ~ Pclass + Sex + Age + SibSp + Fare,
  data = train,
  importance = TRUE
)
plot(titanic_RF)
varImpPlot(titanic_RF)

# Held-out predictions. The former `OOB=T` argument was dropped: it is not a
# parameter of predict.randomForest and was silently ignored.
predicted_Survived <- predict(titanic_RF, newdata = test)
# Mean squared error on the held-out rows.
mean((predicted_Survived - test$Survived)^2)
## [1] 0.1259538
# Score the external test file with the fitted forest and write the result.
# Text columns are read as factors to match the training data.
test <- read.csv("test.csv", stringsAsFactors = TRUE)

# Rows with a missing predictor would receive an NA prediction, so fill gaps
# in Age and Fare first. The training-side age model cannot be reused because
# it needs Survived, which this file does not contain.
age_gap <- is.na(test$Age)
if (any(age_gap)) {
  # NOTE(review): titanic$Age already contains imputed values at this point.
  age_fit <- lm(Age ~ Pclass + SibSp, data = titanic)
  test$Age[age_gap] <- predict(age_fit, newdata = test[age_gap, ])
}
fare_gap <- is.na(test$Fare)
if (any(fare_gap)) {
  test$Fare[fare_gap] <- median(titanic$Fare, na.rm = TRUE)
}

# NOTE(review): the forest is a regression forest, so these are continuous
# scores rather than 0/1 labels - threshold them if class labels are required.
test$Survived <- predict(titanic_RF, newdata = test)

kajalfile <- data.frame(
  PassengerID = test$PassengerId,
  Survived = test$Survived
)
write.csv(
  kajalfile,
  file = "KajalChokshiTitanic",
  fileEncoding = "macroman",
  row.names = FALSE
)