# RMS Titanic was a British passenger liner that sank in the North Atlantic
# Ocean in the early hours of 15 April 1912, after colliding with an iceberg
# during her maiden voyage from Southampton to New York City. There were an
# estimated 2,224 passengers and crew aboard, and more than 1,500 died, making
# it one of the deadliest commercial peacetime maritime disasters in modern
# history. RMS Titanic was the largest ship afloat at the time she entered
# service and was the second of three Olympic-class ocean liners operated by
# the White Star Line. She was built by the Harland and Wolff shipyard in
# Belfast. Thomas Andrews, chief naval architect of the shipyard at the time,
# died in the disaster.
# Load the two passenger tables from disk. The paths are absolute and
# machine-specific; adjust them when running elsewhere.
test <- read.csv(file = "F:/Data science/practice/titanic/test.csv")
train <- read.csv(file = "F:/Data science/practice/titanic/train.csv")
library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)
library(rpart)
library(rpart.plot)
# Stack train and test into one frame so feature engineering is applied to
# both at once. The test rows have no outcome, so `Survived` is filled with NA,
# and a `set` column records where each row came from.
test[["Survived"]] <- NA
test[["set"]] <- "test"
train[["set"]] <- "train"
full <- rbind(train, test)

# Quick structural overview of the combined frame.
str(full)
## 'data.frame': 1309 obs. of 13 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
## $ set : chr "train" "train" "train" "train" ...
# Number of distinct values (NA counts as a value) in every column of `full`.
lapply(full, function(column) length(unique(column)))
## $PassengerId
## [1] 1309
##
## $Survived
## [1] 3
##
## $Pclass
## [1] 3
##
## $Name
## [1] 1307
##
## $Sex
## [1] 2
##
## $Age
## [1] 99
##
## $SibSp
## [1] 7
##
## $Parch
## [1] 8
##
## $Ticket
## [1] 929
##
## $Fare
## [1] 282
##
## $Cabin
## [1] 187
##
## $Embarked
## [1] 4
##
## $set
## [1] 2
# Share of missing values per column, in long form (one row per feature).
# Computed with base R so it does not rely on the deprecated dplyr::funs() or
# the superseded tidyr::gather().
# Note: `Survived` is NA for every test row by construction, so it shows up
# here as partly missing.
missing_values <- data.frame(
  feature = names(full),
  missing_pt = unname(colMeans(is.na(full))),
  stringsAsFactors = FALSE
)

# Horizontal bar chart, features ordered by their missing share.
# Axis titles follow the aesthetics (x = feature, y = missing share);
# coord_flip() only swaps where they are drawn.
g <- ggplot(
  data = missing_values,
  aes(x = reorder(feature, -missing_pt), y = missing_pt)
) +
  geom_bar(stat = "identity", fill = "blue") +
  coord_flip() +
  xlab("Feature") +
  ylab("Missing point")
g
# Fill missing ages with the mean age (taken over train and test rows
# together), then bucket the imputed age into four bands.
# mutate() evaluates sequentially, so `Age_Group` sees the imputed `Age`.
full <- full %>%
  mutate(
    Age = replace(Age, is.na(Age), mean(Age, na.rm = TRUE)),
    Age_Group = case_when(
      Age < 13 ~ "Age.012",
      Age < 18 ~ "Age1317", # first match wins, so this is 13 <= Age < 18
      Age < 60 ~ "Age1859", # 18 <= Age < 60
      Age >= 60 ~ "Age60v"
    )
  )
# Extract the honorific from `Name`: the text between the first ", " and the
# next "." (e.g. "Abbing, Mr. Anthony" -> "Mr").
# The pattern is anchored at both ends, so a single substitution per name is
# enough. The result is written straight into `full` rather than via
# intermediate globals, one of which (`names`) masked the base function name.
title_pattern <- "^.*, (.*?)\\..*$"
full$title <- sub(title_pattern, "\\1", full$Name)

# Frequency of each raw title before consolidation.
table(full$title)
##
## Capt Col Don Dona Dr
## 1 4 1 1 8
## Jonkheer Lady Major Master Miss
## 1 1 2 61 260
## Mlle Mme Mr Mrs Ms
## 2 1 757 197 2
## Rev Sir the Countess
## 8 1 1
# Collapse rare titles into the existing categories so that only
# Master / Miss / Mr / Mrs / Officer remain.
# NOTE(review): "Mme", "Lady" and "Dona" are folded into "Miss", and
# "the Countess" into "Officer" -- kept exactly as in the original mapping;
# confirm this grouping is intended.
miss_titles <- c("Dona", "Mme", "Ms", "Mlle", "Lady")
officer_titles <- c(
  "Capt", "Col", "Major", "Dr", "Rev",
  "Don", "Sir", "the Countess", "Jonkheer"
)

full$title[full$title %in% miss_titles] <- "Miss"
full$title[full$title %in% officer_titles] <- "Officer"

# Frequency of each title after consolidation.
table(full$title)
##
## Master Miss Mr Mrs Officer
## 61 267 757 197 27
# Family size on board = siblings/spouses + parents/children + the passenger.
family_size <- full$SibSp + full$Parch + 1

# Bucket the size while it is still numeric:
#   1 -> "Single", 2-4 -> "Small", 5 or more -> "Big".
# Assigning labels into the column first would turn it into character and make
# the later range comparisons lexicographic, and enumerating individual sizes
# would miss any size not listed.
# Levels are kept in alphabetical order, matching what as.factor() would give.
full$family_group <- factor(
  ifelse(
    family_size == 1, "Single",
    ifelse(family_size < 5, "Small", "Big")
  ),
  levels = c("Big", "Single", "Small")
)
unique(full$family_group)
## [1] Small Single Big
## Levels: Big Single Small
# Number of passengers sharing each ticket, computed per row.
# ave() replaces every element by the size of its ticket group in one pass;
# rows with a missing ticket keep the initial 0.
ticket.unique <- ave(rep(0, nrow(full)), full$Ticket, FUN = length)
full$ticket.unique <- ticket.unique

# Bucket the party size: 1 -> "Single", 2-4 -> "Small", 5 or more -> "Big".
# The whole column is built in one step so it always has nrow(full) entries;
# creating it through masked assignment on a not-yet-existing column only
# works when the last row happens to match the first mask.
full$ticket.size <- as.character(
  cut(
    full$ticket.unique,
    breaks = c(0, 1, 4, Inf),
    labels = c("Single", "Small", "Big")
  )
)
unique(full$ticket.size)
## [1] "Single" "Small" "Big"
# Recode the outcome to readable labels; test rows (NA) stay NA.
full <- mutate(
  full,
  Survived = case_when(Survived == 1 ~ "Yes", Survived == 0 ~ "No")
)

## Creating crude summary: count and share of each outcome in the training
## rows.
crude_summary <- full %>%
  filter(set == "train") %>%
  select(PassengerId, Survived) %>%
  group_by(Survived) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))

# Overall (crude) survival rate in the training rows.
crude_survrate <- crude_summary$freq[crude_summary$Survived == "Yes"]

# Print the summary itself. Calling table() on it would cross-tabulate its
# three columns against each other and give a 3-D table of 0/1 cells.
# On the training rows of the recorded run this is
# No: n = 549, freq = 0.616 and Yes: n = 342, freq = 0.384.
print(crude_summary)
# Modelling frame: the engineered predictors for the training rows only.
# `full` was built as rbind(train, test), so the first nrow(train) rows are the
# training rows, in the same order as `train`.
model_features <- c(
  "Pclass", "title", "Sex", "Embarked", "family_group", "ticket.size"
)
sample <- full[seq_len(nrow(train)), model_features]

# Outcome as a factor with the original 0/1 coding from `train`.
response <- as.factor(train$Survived)
sample$Survived <- response

# Seed the RNG *before* the random split so that the partition, and every
# metric computed from it, is reproducible from run to run.
set.seed(123)

# Stratified 60/40 split into a fitting set and a hold-out validation set.
train_idx <- createDataPartition(
  sample$Survived,
  times = 1, p = 0.6, list = FALSE
)
train_val <- sample[train_idx, ]
test_val <- sample[-train_idx, ]
# Class balance, in percent, of the full training data and of both splits.
# The scaling by 100 has to happen after prop.table(); scaling the counts
# beforehand cancels out and yields plain proportions.
round(prop.table(table(train$Survived)) * 100, digits = 1)
##
## 0 1
## 61.6 38.4
round(prop.table(table(train_val$Survived)) * 100, digits = 1)
##
## 0 1
## 61.6 38.4
round(prop.table(table(test_val$Survived)) * 100, digits = 1)
##
## 0 1
## 61.7 38.3
# Fit a classification tree for `Survived` on all other columns of the
# fitting split. The seed is fixed first; presumably this pins the random
# numbers rpart draws internally (e.g. for cross-validation) -- TODO confirm.
set.seed(123)
model <- rpart(
  formula = Survived ~ .,
  data = train_val,
  method = "class"
)

# Draw the fitted tree with all leaves aligned at the bottom.
rpart.plot(model, extra = 3, fallen.leaves = TRUE)
## Predicting on the fitting split, only to measure in-sample accuracy.
# predict() for rpart models takes the data through `newdata`; an argument
# named `data` is not recognised and is silently ignored, which would fall
# back to the fitted values instead of scoring the frame that was passed.
pred <- predict(model, newdata = train_val, type = "class")
confusionMatrix(pred, train_val$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 294 54
## 1 36 152
##
## Accuracy : 0.8321
## 95% CI : (0.7977, 0.8628)
## No Information Rate : 0.6157
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6393
## Mcnemar's Test P-Value : 0.07314
##
## Sensitivity : 0.8909
## Specificity : 0.7379
## Pos Pred Value : 0.8448
## Neg Pred Value : 0.8085
## Prevalence : 0.6157
## Detection Rate : 0.5485
## Detection Prevalence : 0.6493
## Balanced Accuracy : 0.8144
##
## 'Positive' Class : 0
##
# Score the hold-out split and compare predicted with observed classes.
predT <- predict(object = model, newdata = test_val, type = "class")
confusionMatrix(data = predT, reference = test_val$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 198 44
## 1 21 92
##
## Accuracy : 0.8169
## 95% CI : (0.7727, 0.8558)
## No Information Rate : 0.6169
## P-Value [Acc > NIR] : 2.529e-16
##
## Kappa : 0.5998
## Mcnemar's Test P-Value : 0.006357
##
## Sensitivity : 0.9041
## Specificity : 0.6765
## Pos Pred Value : 0.8182
## Neg Pred Value : 0.8142
## Prevalence : 0.6169
## Detection Rate : 0.5577
## Detection Prevalence : 0.6817
## Balanced Accuracy : 0.7903
##
## 'Positive' Class : 0
##
# Rows of the combined frame that came from the test file, with all engineered
# features attached.
t <- full %>%
  filter(set == "test")
colnames(t)
## [1] "PassengerId" "Survived" "Pclass" "Name"
## [5] "Sex" "Age" "SibSp" "Parch"
## [9] "Ticket" "Fare" "Cabin" "Embarked"
## [13] "set" "Age_Group" "title" "family_group"
## [17] "ticket.unique" "ticket.size"
# Predict the class for every test passenger.
val <- predict(model, newdata = t, type = "class")

# Two-column result: passenger id and predicted outcome.
# The id column keeps the spelling used in the input data (`PassengerId`);
# presumably the file is consumed by a grader that matches column names
# exactly -- verify against the expected submission format.
submission <- data.frame(
  PassengerId = t$PassengerId,
  Survived = val
)
names(submission)
## [1] "PassengerId" "Survived"

# Write the predictions without the row-name column.
write.csv(submission, "submission.csv", row.names = FALSE)