Survival analysis is used to predict the survival of a person with respect to a set of explanatory variables.
#`Titanic` is assumed to be the Kaggle Titanic training set loaded earlier as a tibble (e.g. via readr::read_csv), not the base R Titanic contingency table
Titanic_data1=Titanic
head(Titanic_data1)
Above are the first rows of the data we are going to analyze.
summary(Titanic_data1)
PassengerId Survived Pclass
Min. : 1.0 Min. :0.0000 Min. :1.000
1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000
Median :446.0 Median :0.0000 Median :3.000
Mean :446.0 Mean :0.3838 Mean :2.309
3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
Max. :891.0 Max. :1.0000 Max. :3.000
Name Sex Age
Length:891 Length:891 Min. : 0.42
Class :character Class :character 1st Qu.:20.12
Mode :character Mode :character Median :28.00
Mean :29.70
3rd Qu.:38.00
Max. :80.00
NA's :177
SibSp Parch Ticket
Min. :0.000 Min. :0.0000 Length:891
1st Qu.:0.000 1st Qu.:0.0000 Class :character
Median :0.000 Median :0.0000 Mode :character
Mean :0.523 Mean :0.3816
3rd Qu.:1.000 3rd Qu.:0.0000
Max. :8.000 Max. :6.0000
Fare Cabin Embarked
Min. : 0.00 Length:891 Length:891
1st Qu.: 7.91 Class :character Class :character
Median : 14.45 Mode :character Mode :character
Mean : 32.20
3rd Qu.: 31.00
Max. :512.33
The output above is the descriptive summary of the data.
str(Titanic_data1)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 891 obs. of 12 variables:
$ PassengerId: num 1 2 3 4 5 6 7 8 9 10 ...
$ Survived : num 0 1 1 1 0 0 0 0 1 1 ...
$ Pclass : num 3 1 3 1 3 3 1 3 3 2 ...
$ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
$ Sex : chr "male" "female" "female" "female" ...
$ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
$ SibSp : num 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : num 0 0 0 0 0 0 0 1 2 0 ...
$ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Cabin : chr NA "C85" NA "C123" ...
$ Embarked : chr "S" "C" "S" "S" ...
The structure above shows that Name, Sex, Ticket, Cabin and Embarked are of character class and the rest are numeric.
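Optionally, the categorical columns can be converted to explicit factors up front; glm() would coerce character predictors to factors anyway, but explicit factors make the levels visible. A minimal sketch:
#optional: make the categorical columns explicit factors
Titanic_data1$Sex=factor(Titanic_data1$Sex)
Titanic_data1$Embarked=factor(Titanic_data1$Embarked)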
#checking for the null values
colSums(is.na(Titanic_data1))
PassengerId Survived Pclass Name Sex
0 0 0 0 0
Age SibSp Parch Ticket Fare
177 0 0 0 0
Cabin Embarked
687 2
The table above shows that Age has 177 null values, Cabin has 687 and Embarked has 2, so we have to treat the null values.
#treating null values
#removing cabin (column 11)
Titanic_data1=Titanic_data1[,-11]
#removing null values
Titanic_data1=na.omit(Titanic_data1)
As the Cabin column has a lot of null values (more than 50%), we removed it from our data; for the null values in Age and Embarked we simply omit those rows.
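Dropping the 177 rows with missing Age throws away a fifth of the data. An alternative, sketched below on a hypothetical copy taken before the na.omit() step, is to impute Age with its median and drop only the 2 rows with missing Embarked:
#alternative treatment (sketch; assumes Titanic_raw holds the data before na.omit above)
Titanic_alt=Titanic_raw
Titanic_alt$Age[is.na(Titanic_alt$Age)]=median(Titanic_alt$Age,na.rm = TRUE)
Titanic_alt=Titanic_alt[!is.na(Titanic_alt$Embarked),]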
#graphical visualization of the data
library(ggplot2)
gr_titanic=Titanic_data1
gr_titanic$Survived=factor(gr_titanic$Survived)
qplot(Age,data = gr_titanic,geom = "density",fill=Survived,alpha=I(.7),main = "distribution of age",xlab = "Age",ylab = "density",na.rm=TRUE)
The density plot shows the age distribution of passengers who survived versus those who did not.
# scatter plot
qplot(Age,Age,data=gr_titanic,shape=Survived,color=Survived,facets = Pclass~Sex,size=I(2),main = "survival scatterplot",na.rm=TRUE)
The facetted scatter plot above shows survival by age for males and females within each passenger class.
#boxplot
gr_titanic$Age[is.na(gr_titanic$Age)]=mean(gr_titanic$Age,na.rm = TRUE)#no-op here: rows with missing Age were already dropped by na.omit() above
ggplot(gr_titanic,aes(x=Survived,y=Age,fill=Survived))+geom_boxplot()#boxplot of Age by survival; na.rm is not an aesthetic, so it is removed from aes()
The boxplot gives a descriptive view of Age by survival. Now moving on to the analysis of our Titanic data:
1) First we split the data into train and test sets.
2) Then we fit a logistic regression model to the data, with Survived as the dependent variable and the rest as independent variables.
#removing some undesired variables
Titanic_data11=Titanic_data1[,-1]#drop PassengerId
Titanic_data11=Titanic_data11[,-3]#drop Name (now column 3)
Titanic_data21=Titanic_data11[,-7]#drop Ticket (now column 7)
Titanic_data3=Titanic_data21[,-7]#drop Fare (now column 7)
Titanic_data3
#splitting the data using the caTools library
library(caTools)
set.seed(123)#assumed seed so the split is reproducible
split_ti=sample.split(Titanic_data3$Survived,SplitRatio = 0.7)#split on the outcome vector, not the whole data frame
head(split_ti)
train_ti=subset(Titanic_data3,split_ti==TRUE)
test_ti=subset(Titanic_data3,split_ti==FALSE)
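A quick sanity check, worth running after any split, is that the survival rate is similar in both subsets (sample.split stratifies on the vector it is given):
#check that the Survived ratio is preserved in both subsets
prop.table(table(train_ti$Survived))
prop.table(table(test_ti$Survived))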
#creating glm model
model_ti=glm(Survived~.,data = train_ti,family = binomial(link = "logit"))
summary(model_ti)
Call:
glm(formula = Survived ~ ., family = binomial(link = "logit"),
data = train_ti)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6460 -0.6327 -0.4258 0.6290 2.3769
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 5.55703 0.73871 7.523 5.37e-14 ***
Pclass -1.17220 0.19035 -6.158 7.36e-10 ***
Sexmale -2.62834 0.29128 -9.023 < 2e-16 ***
Age -0.04541 0.01090 -4.165 3.11e-05 ***
SibSp -0.20302 0.16397 -1.238 0.216
Parch -0.10820 0.14960 -0.723 0.470
EmbarkedQ -0.38294 0.71954 -0.532 0.595
EmbarkedS -0.40457 0.36088 -1.121 0.262
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 544.48 on 405 degrees of freedom
Residual deviance: 364.98 on 398 degrees of freedom
AIC: 380.98
Number of Fisher Scoring iterations: 5
From the table above, Pclass, Sex and Age are significant by p-value, while SibSp, Parch and Embarked are not. In logistic regression we do not rely on p-values alone; we also compare the residual deviance and AIC across candidate models.
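Instead of dropping terms by hand as below, R's built-in step() can run the same AIC-guided backward elimination automatically; a minimal sketch:
#AIC-based backward elimination as an alternative to manual term dropping
model_step=step(model_ti,direction = "backward")
summary(model_step)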
#creating models with different variable sets to reduce the residual deviance and AIC
model_ti1=glm(Survived~.-Parch,data = train_ti,family = binomial(link = "logit"))
summary(model_ti1)#we also tried excluding Sex, but that increased the residual deviance and AIC, so we keep it
Call:
glm(formula = Survived ~ . - Parch, family = binomial(link = "logit"),
data = train_ti)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6964 -0.6376 -0.4207 0.6434 2.3884
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 5.52434 0.73755 7.490 6.88e-14 ***
Pclass -1.18280 0.19047 -6.210 5.31e-10 ***
Sexmale -2.57598 0.28087 -9.171 < 2e-16 ***
Age -0.04582 0.01090 -4.203 2.63e-05 ***
SibSp -0.23574 0.15813 -1.491 0.136
EmbarkedQ -0.37711 0.71518 -0.527 0.598
EmbarkedS -0.40571 0.36054 -1.125 0.260
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 544.48 on 405 degrees of freedom
Residual deviance: 365.52 on 399 degrees of freedom
AIC: 379.52
Number of Fisher Scoring iterations: 5
model_ti2=glm(Survived~.-Embarked,data = train_ti,family = "binomial")
summary(model_ti2)
Call:
glm(formula = Survived ~ . - Embarked, family = "binomial", data = train_ti)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6972 -0.6423 -0.4261 0.6256 2.3674
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 5.35726 0.70860 7.560 4.02e-14 ***
Pclass -1.22952 0.18331 -6.707 1.98e-11 ***
Sexmale -2.62681 0.28889 -9.093 < 2e-16 ***
Age -0.04568 0.01086 -4.208 2.58e-05 ***
SibSp -0.20795 0.16355 -1.271 0.204
Parch -0.10888 0.14925 -0.730 0.466
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 544.48 on 405 degrees of freedom
Residual deviance: 366.23 on 400 degrees of freedom
AIC: 378.23
Number of Fisher Scoring iterations: 5
Comparing the three fits: removing Parch lowers the AIC from 380.98 to 379.52, while removing Embarked instead lowers it slightly further to 378.23 but at the cost of a higher residual deviance (366.23 vs 365.52). We therefore exclude only Parch and keep “model_ti1” as our model for prediction.
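The comparison can be tabulated directly with stats::AIC(), which accepts several fitted models at once:
#side-by-side AIC of the three candidate models
AIC(model_ti,model_ti1,model_ti2)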
pred_ti1=predict(model_ti1,test_ti,type = "response")
head(pred_ti1)
1 2 3 4 5 6
0.59358923 0.89053948 0.06852521 0.75973102 0.78211895 0.12761505
The output above shows predictions as probabilities, which are hard to read on their own, so for better understanding we next tabulate predicted against actual values.
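Before tabulating, one probability can be made concrete by scoring a single hypothetical passenger (the values below are made up for illustration):
#hypothetical passenger: 3rd class, male, age 30, no relatives aboard, embarked at Southampton
new_pass=data.frame(Pclass=3,Sex="male",Age=30,SibSp=0,Parch=0,Embarked="S")
predict(model_ti1,new_pass,type = "response")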
#confusion matrix
table_ti=table(actualvalue=test_ti$Survived,predictedvalue=pred_ti1>0.5)
table_ti
predictedvalue
actualvalue FALSE TRUE
0 155 23
1 34 94
The confusion matrix above shows the counts of correctly and incorrectly predicted values:
A11 = TRUE NEGATIVE (correctly predicted as “not survived”)
A22 = TRUE POSITIVE (correctly predicted as “survived”)
A12 = FALSE POSITIVE (falsely predicted as “survived”)
A21 = FALSE NEGATIVE (falsely predicted as “not survived”)
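Beyond overall accuracy, sensitivity and specificity can be read straight off the same table (a short sketch using the row and column names printed above):
#sensitivity = TP/(TP+FN), specificity = TN/(TN+FP)
sensitivity=table_ti["1","TRUE"]/sum(table_ti["1",])
specificity=table_ti["0","FALSE"]/sum(table_ti["0",])
c(sensitivity=sensitivity,specificity=specificity)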
Based on the chosen threshold we now check the accuracy, and later search for the threshold value that maximizes it.
#determining accuracy of the prediction
acc=sum(diag(table_ti))/sum(table_ti)
print(paste("Accuracy of prediction =",acc*100,"%"))
[1] "Accuracy of prediction = 81.3725490196078 %"
#since accuracy is about 81%, we try to improve it by selecting an optimal threshold
library(ROCR)
rocr_ti=prediction(pred_ti1,test_ti$Survived)
rocr_ti_per=performance(rocr_ti,"acc")
rocr_ti_per1=performance(rocr_ti,"tpr","fpr")
plot(rocr_ti_per1,colorize=TRUE)
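Besides eyeballing the curve, the accuracy-maximizing cutoff can be read directly from the accuracy performance object computed above:
#cutoff with the highest accuracy on the test set
cutoffs=rocr_ti_per@x.values[[1]]
accs=rocr_ti_per@y.values[[1]]
cutoffs[which.max(accs)]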
From the colorized ROC curve we can see that a good cutoff lies around 0.6, so we rebuild the table with the threshold raised from 0.5 to 0.60.
table_ti1=table(actualvalue=test_ti$Survived,predictedvalue=pred_ti1>0.60)
table_ti1
predictedvalue
actualvalue FALSE TRUE
0 162 16
1 42 86
accuracy_ti1=sum(diag(table_ti1))/sum(table_ti1)
print(paste("Accuracy of prediction =",round(accuracy_ti1*100),"%"))
[1] "Accuracy of prediction = 81 %"
So the accuracy of prediction remains about 81% with the threshold set to 0.60.
Now we can view the same table graphically.
plot(table_ti1,col=c("red","green"))
The mosaic plot represents the table graphically, with green for passengers predicted as “survived” and red for those predicted as “not survived”.
ggplot(train_ti,aes(x=Pclass,y=Survived))+geom_point()+stat_smooth(method = "glm",method.args = list(family="binomial"),se=TRUE)
This graph shows that passengers in a higher class (lower Pclass number) have a higher chance of survival.
CONCLUSION: Our model predicts survival with an accuracy of about 81%.