Logistic Regression on Titanic Dataset

R Markdown

## Load Titanic library to get the dataset
library(titanic)

## Load the datasets
data("titanic_train")
data("titanic_test")

## Setting Survived column for test data to NA
titanic_test$Survived <- NA

## Combining Training and Testing dataset
complete_data <- rbind(titanic_train, titanic_test)

## Check data structure
str(complete_data)

## 'data.frame':    1309 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

## Let's check for any missing values in the data
colSums(is.na(complete_data))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0         418           0           0           0         263 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           1           0           0

## Checking for empty values
colSums(complete_data=='')

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0          NA           0           0           0          NA 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0          NA        1014           2

## Check number of uniques values for each of the column to find out columns which we can convert to factors
sapply(complete_data, function(x) length(unique(x)))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##        1309           3           3        1307           2          99 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           7           8         929         282         187           4

## Missing values imputation
complete_data$Embarked[complete_data$Embarked==""] <- "S"
complete_data$Age[is.na(complete_data$Age)] <- median(complete_data$Age,na.rm=T)

## Removing Cabin as it has very high missing values, passengerId, Ticket and Name are not required
library(dplyr)
titanic_data <- complete_data %>% select(-c(Cabin, PassengerId, Ticket, Name))

## Converting "Survived","Pclass","Sex","Embarked" to factors
for (i in c("Survived","Pclass","Sex","Embarked")){
  titanic_data[,i]=as.factor(titanic_data[,i])
}

## Create dummy variables for categorical variables
library(dummies)
titanic_data <- dummy.data.frame(titanic_data, names=c("Pclass","Sex","Embarked"), sep="_")

## Splitting training and test data
train <- titanic_data[1:667,]
test <- titanic_data[668:889,]

## Model Creation
model <- glm(Survived ~.,family=binomial(link='logit'),data=train)

## Model Summary
summary(model)

## 
## Call:
## glm(formula = Survived ~ ., family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3804  -0.6562  -0.4300   0.6392   2.3950  
## 
## Coefficients: (3 not defined because of singularities)
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.373105   0.319779  -4.294 1.76e-05 ***
## Pclass_1     2.175104   0.359365   6.053 1.42e-09 ***
## Pclass_2     1.302268   0.271680   4.793 1.64e-06 ***
## Pclass_3           NA         NA      NA       NA    
## Sex_female   2.677814   0.226863  11.804  < 2e-16 ***
## Sex_male           NA         NA      NA       NA    
## Age         -0.031671   0.008945  -3.540 0.000399 ***
## SibSp       -0.248975   0.123365  -2.018 0.043570 *  
## Parch       -0.091603   0.141950  -0.645 0.518718    
## Fare        -0.001397   0.003179  -0.440 0.660254    
## Embarked_C   0.431447   0.271693   1.588 0.112288    
## Embarked_Q   0.533193   0.369337   1.444 0.148837    
## Embarked_S         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 891.99  on 666  degrees of freedom
## Residual deviance: 605.78  on 657  degrees of freedom
## AIC: 625.78
## 
## Number of Fisher Scoring iterations: 5

## Using anova() to analyze the table of devaiance
anova(model, test="Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Survived
## 
## Terms added sequentially (first to last)
## 
## 
##            Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                         666     891.99              
## Pclass_1    1   39.603       665     852.39 3.112e-10 ***
## Pclass_2    1   26.485       664     825.91 2.655e-07 ***
## Pclass_3    0    0.000       664     825.91              
## Sex_female  1  197.978       663     627.93 < 2.2e-16 ***
## Sex_male    0    0.000       663     627.93              
## Age         1    8.986       662     618.94  0.002721 ** 
## SibSp       1    8.114       661     610.83  0.004393 ** 
## Parch       1    0.998       660     609.83  0.317889    
## Fare        1    0.044       659     609.79  0.834588    
## Embarked_C  1    1.936       658     607.85  0.164139    
## Embarked_Q  1    2.067       657     605.78  0.150485    
## Embarked_S  0    0.000       657     605.78              
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Predicting Test Data
result <- predict(model,newdata=test,type='response')
result <- ifelse(result > 0.5,1,0)

## Confusion matrix and statistics
library(caret)
confusionMatrix(data=result, reference=test$Survived)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 128  25
##          1  13  56
##                                           
##                Accuracy : 0.8288          
##                  95% CI : (0.7727, 0.8759)
##     No Information Rate : 0.6351          
##     P-Value [Acc > NIR] : 1.817e-10       
##                                           
##                   Kappa : 0.6187          
##  Mcnemar's Test P-Value : 0.07435         
##                                           
##             Sensitivity : 0.9078          
##             Specificity : 0.6914          
##          Pos Pred Value : 0.8366          
##          Neg Pred Value : 0.8116          
##              Prevalence : 0.6351          
##          Detection Rate : 0.5766          
##    Detection Prevalence : 0.6892          
##       Balanced Accuracy : 0.7996          
##                                           
##        'Positive' Class : 0               
##

## ROC Curve and calculating the area under the curve(AUC)
library(ROCR)
predictions <- predict(model, newdata=test, type="response")
ROCRpred <- prediction(predictions, test$Survived)
ROCRperf <- performance(ROCRpred, measure = "tpr", x.measure = "fpr")

plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2,1.7), print.cutoffs.at = seq(0,1,0.1))

auc <- performance(ROCRpred, measure = "auc")
auc <- auc@y.values[[1]]
auc

## [1] 0.8714211

Logistic Regression on Titanic Dataset

Abhay Padda

29 October 2017

R Markdown