Loading Libraries

library(MASS)
library(ggplot2)

Data Exploration and Data Cleaning

Loading Dataset and Data Exploration

train <- read.csv("titanic_train.csv")
test <- read.csv("titanic_test.csv")

# The test set has no Survived column; add it as NA so the two data
# frames can be stacked and cleaned together.
test$Survived <- NA
all <- rbind(train, test)

colSums(is.na(all))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0         418           0           0           0         263 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           1           0           0
space = function (x) {sum(x=="") }
apply(all, 2, space)
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0          NA           0           0           0          NA 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0          NA        1014           2

Predicting Missing Age

There are 263 observations with missing Age, so let's try to predict it from Fare and passenger class. The plot below smooths Age against Fare separately for each class.

ggplot(data = all, aes(x = Fare, y = Age, col = factor(Pclass))) +
  geom_smooth()

index <- which(is.na(all$Age))
predict_age <- all[index,]  # rows with missing Age (to be imputed)
train_age <- all[-index,]   # rows with known Age (to fit the model on)

lm_age <- lm(Age ~ Pclass + Fare + Pclass:Fare, data = train_age)
summary(lm_age)
## 
## Call:
## lm(formula = Age ~ Pclass + Fare + Pclass:Fare, data = train_age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.908  -8.117  -1.360   7.775  47.876 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 45.60475    1.56595  29.123  < 2e-16 ***
## Pclass      -5.64937    0.67503  -8.369  < 2e-16 ***
## Fare         0.14181    0.02824   5.021 6.03e-07 ***
## Pclass:Fare -0.15585    0.02583  -6.033 2.23e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.88 on 1041 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2012, Adjusted R-squared:  0.1988 
## F-statistic: 87.38 on 3 and 1041 DF,  p-value: < 2.2e-16
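As a quick sanity check of the fitted equation (using the coefficients above; note that Pclass is still numeric at this point, which is why it enters as a single slope), a hypothetical first-class passenger with Fare = 80 gets a predicted age of about 45.60 - 5.65*1 + 0.1418*80 - 0.1559*1*80 ≈ 38.8 years:

# Worked example: predicted age for a hypothetical first-class
# passenger paying a fare of 80
predict(lm_age, newdata = data.frame(Pclass = 1, Fare = 80))
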
# Truncate the predictions to whole years and fill them in
predict_age$Age <- as.integer(predict(lm_age, newdata = predict_age))

all <- rbind(train_age, predict_age)
colSums(is.na(all))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0         418           0           0           0           0 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           1           0           0
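A linear model can extrapolate to implausible values, so it is worth a quick check that the imputed ages fall in a sensible range:

# The imputed ages should all be positive and in a plausible range
summary(predict_age$Age)
stopifnot(min(predict_age$Age) > 0)  # fail loudly if any imputed age is non-positive
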
# Family size = siblings/spouses + parents/children + the passenger
all$family <- all$SibSp + all$Parch + 1

Splitting Data for Survival Prediction

Factor Conversion and Train/Test Split

# Convert categorical variables to factor type
all$Survived <- as.factor(all$Survived)
all$Pclass <- as.factor(all$Pclass)

index <- which(is.na(all$Survived)) 
train <- all[-index,]
test <- all[index,]

str(all)
## 'data.frame':    1309 obs. of  13 variables:
##  $ PassengerId: int  1 2 3 4 5 7 8 9 10 11 ...
##  $ Survived   : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
##  $ Pclass     : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 520 629 417 581 732 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
##  $ Age        : num  22 38 26 35 35 54 2 27 14 4 ...
##  $ SibSp      : int  1 1 0 1 0 0 3 0 1 1 ...
##  $ Parch      : int  0 0 0 0 0 0 1 2 0 1 ...
##  $ Ticket     : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 86 396 345 133 617 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 131 1 1 1 147 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
##  $ family     : num  2 2 1 2 1 1 5 3 2 3 ...
# 80/20 split for validation; calling set.seed() first would make
# this split (and the numbers below) reproducible
index <- sample(1:nrow(train), nrow(train)*0.8)
train_t <- train[index,]
test_t <- train[-index,]

Logistic Regression

First Model

log.fit <- glm(Survived ~ Pclass + Sex + Age + Fare + Embarked + family, family = "binomial", data = train_t)
summary(log.fit)
## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Fare + Embarked + 
##     family, family = "binomial", data = train_t)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6976  -0.6187  -0.4067   0.6138   2.5528  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  15.401549 535.411352   0.029  0.97705    
## Pclass2      -1.259346   0.337487  -3.732  0.00019 ***
## Pclass3      -2.543961   0.351639  -7.235 4.67e-13 ***
## Sexmale      -2.838304   0.227889 -12.455  < 2e-16 ***
## Age          -0.045196   0.009507  -4.754 2.00e-06 ***
## Fare          0.001457   0.002501   0.583  0.56006    
## EmbarkedC   -10.749439 535.411267  -0.020  0.98398    
## EmbarkedQ   -10.681581 535.411354  -0.020  0.98408    
## EmbarkedS   -10.981700 535.411256  -0.021  0.98364    
## family       -0.234607   0.077335  -3.034  0.00242 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 943.08  on 711  degrees of freedom
## Residual deviance: 615.14  on 702  degrees of freedom
## AIC: 635.14
## 
## Number of Fisher Scoring iterations: 12
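Note the enormous standard errors on the intercept and the Embarked dummies. Only two passengers in the training data have a blank Embarked value, and both survived, so that baseline level of the Embarked factor is perfectly separated; its coefficient estimates diverge, which is also why the fit needs 12 Fisher scoring iterations. A quick cross-tabulation makes this visible:

# Port of embarkation vs. survival: the blank level contains only
# two passengers, both survivors
table(train$Embarked, train$Survived)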

Model with Important Variables

Fare and Embarked are not significant, so we refit with only the significant predictors.

log.fit2 <- glm(Survived ~ Pclass + Sex + Age + family, family = "binomial", data = train_t)
summary(log.fit2)
## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + family, family = "binomial", 
##     data = train_t)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7178  -0.5939  -0.4021   0.6187   2.5351  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.711912   0.527812   8.927  < 2e-16 ***
## Pclass2     -1.424161   0.300026  -4.747 2.07e-06 ***
## Pclass3     -2.670890   0.297451  -8.979  < 2e-16 ***
## Sexmale     -2.876431   0.225357 -12.764  < 2e-16 ***
## Age         -0.046654   0.009465  -4.929 8.26e-07 ***
## family      -0.237618   0.074528  -3.188  0.00143 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 943.08  on 711  degrees of freedom
## Residual deviance: 617.04  on 706  degrees of freedom
## AIC: 629.04
## 
## Number of Fisher Scoring iterations: 5

Prediction

test_t$prob <- predict(log.fit2, newdata = test_t, type = "response")
# Classify as survived when the predicted probability exceeds 0.5
test_t$pred <- ifelse(test_t$prob > 0.5, 1, 0)
table(test_t$pred, test_t$Survived)
##    
##      0  1
##   0 94 27
##   1 11 47
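The hold-out accuracy implied by this table is (94 + 47) / 179 ≈ 0.79. A small helper (illustrative; it works for any confusion matrix) computes it directly:

# Accuracy = correctly classified / total; diag() picks out the
# cells where prediction and truth agree
accuracy <- function(cm) sum(diag(cm)) / sum(cm)
accuracy(table(test_t$pred, test_t$Survived))  # about 0.79 on this split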

Linear Discriminant Analysis

Modeling

lda.model <- lda(Survived ~ Pclass + Sex + Age + Parch, data = train_t)
lda.model
## Call:
## lda(Survived ~ Pclass + Sex + Age + Parch, data = train_t)
## 
## Prior probabilities of groups:
##         0         1 
## 0.6235955 0.3764045 
## 
## Group means:
##     Pclass2   Pclass3   Sexmale      Age     Parch
## 0 0.1869369 0.6644144 0.8536036 29.63626 0.3265766
## 1 0.2611940 0.3432836 0.3134328 28.36881 0.4253731
## 
## Coefficients of linear discriminants:
##                 LD1
## Pclass2 -0.88656018
## Pclass3 -1.67765423
## Sexmale -2.19786825
## Age     -0.02424166
## Parch   -0.17545492
plot(lda.model)

Prediction

lda_prob <- predict(lda.model, newdata = test_t)
table(Predicted = lda_prob$class, test_t$Survived)
##          
## Predicted  0  1
##         0 91 26
##         1 14 48

Quadratic Discriminant Analysis

Modeling

qda.model <- qda(Survived ~ Pclass + Sex + Age + Parch, data = train_t)
qda.model
## Call:
## qda(Survived ~ Pclass + Sex + Age + Parch, data = train_t)
## 
## Prior probabilities of groups:
##         0         1 
## 0.6235955 0.3764045 
## 
## Group means:
##     Pclass2   Pclass3   Sexmale      Age     Parch
## 0 0.1869369 0.6644144 0.8536036 29.63626 0.3265766
## 1 0.2611940 0.3432836 0.3134328 28.36881 0.4253731

Prediction

qda_prob <- predict(qda.model, newdata = test_t)
table(Predicted = qda_prob$class, test_t$Survived)
##          
## Predicted  0  1
##         0 87 23
##         1 18 51
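Reusing the accuracy helper from the logistic regression section, the three confusion matrices above translate into hold-out accuracies of roughly 0.79 (logistic), 0.78 (LDA), and 0.77 (QDA) on this particular split:

# Side-by-side hold-out accuracy of the three models
data.frame(model = c("logistic", "LDA", "QDA"),
           accuracy = c(accuracy(table(test_t$pred, test_t$Survived)),
                        accuracy(table(lda_prob$class, test_t$Survived)),
                        accuracy(table(qda_prob$class, test_t$Survived))))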

Conclusion

LDA and QDA work well when the classes are well separated and the predictors are approximately normally distributed within each class. When that normality assumption does not hold, logistic regression has an edge over LDA and QDA. Logistic regression, in turn, breaks down when the response classes are perfectly separated, as the inflated Embarked coefficients in the first model illustrate. In practice, logistic regression is the standard choice for binary classification, while LDA and QDA are more popular when there are more than two response classes.
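
The within-class normality assumption can also be probed directly. Below is a minimal sketch for a single continuous predictor (Age), applying a Shapiro-Wilk test within each class:

# Shapiro-Wilk normality test of Age within each Survived class;
# small p-values cast doubt on the normality assumption
by(train_t$Age, train_t$Survived, shapiro.test)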