1. Load the data and necessary packages

library(ggplot2)
library(ggthemes)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Locate the Titanic CSV files without changing the working directory.
# The base directory defaults to the location this analysis was written
# against and can be overridden with the TITANIC_DATA_DIR environment variable.
data_dir <- Sys.getenv(
  "TITANIC_DATA_DIR",
  unset = "/Users/milosjanicki/Data_projects/Date_Guide"
)
titanic_train <- read.csv(file.path(data_dir, "Titanic", "train.csv"))
titanic_test <- read.csv(file.path(data_dir, "Titanic", "test.csv"))

# Give the test rows a Survived column (all NA) so that both frames have the
# same columns and can be stacked for joint feature engineering.
titanic_test$Survived <- NA
titanic <- rbind(titanic_train, titanic_test)

2. Perform EDA

- survival – Survival (0 = No; 1 = Yes)
- pclass – Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
- name – Name
- sex – Sex
- age – Age
- sibsp – Number of Siblings/Spouses Aboard
- parch – Number of Parents/Children Aboard
- ticket – Ticket Number
- fare – Passenger Fare
- cabin – Cabin
- embarked – Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

# Show the structure of the combined data: column names, types, first values
str(titanic)
## 'data.frame':    1309 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 354 273 16 555 516 625 413 577 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
# Print the first rows of the combined data
head(titanic)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp
## 1                             Braund, Mr. Owen Harris   male  22     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                              Heikkinen, Miss. Laina female  26     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                            Allen, Mr. William Henry   male  35     0
## 6                                    Moran, Mr. James   male  NA     0
##   Parch           Ticket    Fare Cabin Embarked
## 1     0        A/5 21171  7.2500              S
## 2     0         PC 17599 71.2833   C85        C
## 3     0 STON/O2. 3101282  7.9250              S
## 4     0           113803 53.1000  C123        S
## 5     0           373450  8.0500              S
## 6     0           330877  8.4583              Q

Let’s take a look at a bar plot of survival rates:

# Bar chart of survival counts in the training set; each bar is labelled with
# its share of all counted passengers.
ggplot(titanic_train, aes(factor(Survived))) +
  geom_bar() +
  labs(x = "Survived", y = "Passengers") +
  # Round the share to one decimal place: the unrounded percentage prints with
  # many decimal places and does not fit inside the bar.
  stat_count(
    aes(label = paste0(round(prop.table(..count..) * 100, 1), "%")),
    vjust = 1, geom = "text", position = "identity", color = "white"
  ) +
  # No colour aesthetic is mapped, so a colour scale is not needed here.
  theme_economist()

Roughly 2/5 survived…not so bad.

3. Data Preprocessing and Feature Engineering

Let’s take one more look at our data - let’s check how many levels our variables have

# Count the distinct values (NA counts as a value) in every column.
# vapply() pins the result to exactly one integer per column, whereas the
# return type of sapply() depends on its input.
vapply(titanic, function(column) length(unique(column)), integer(1))
## PassengerId    Survived      Pclass        Name         Sex         Age 
##        1309           3           3        1307           2          99 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           7           8         929         282         187           4

(Don’t be surprised that Survived now has 3 distinct values – 0, 1 and NA – it is because we combined the train and test data sets.)

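It seems reasonable to discard some variables, such as PassengerId, Name and Ticket, since they have almost as many distinct values as there are observations. (The code below drops Name and Ticket; PassengerId stays in the data frame.)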

# Drop the Name and Ticket columns; every other column is kept.
titanic <- select(titanic, -one_of("Name", "Ticket"))

Several variables however require transformation into factors: Cabin and Age

The Cabin variable combines deck and room data. Let’s try to split it, as that might give us some information on the location of the passengers’ cabins.

As we know, the Titanic cracked in the middle, opening up for the water to flood the decks. The ship sank elevating the bow part while the rear sank first. Perhaps the passengers that were remote from the central part had higher chances of survival? Or perhaps those that were in the bow part of the ship, as that part sank the latest?

# Deck is the first character of the cabin code, stored as a factor.
titanic$Deck <- as.factor(substr(titanic$Cabin, 1, 1))

# Location is the number formed by characters 2-3 of the cabin code. Whatever
# cannot be read as a number there becomes NA (with a coercion warning).
# NOTE(review): only two characters are read, so a three-digit cabin number
# keeps just its first two digits - confirm this is intended.
titanic$Location <- as.numeric(substr(titanic$Cabin, 2, 3))
## Warning: NAs introduced by coercion
# Distribution (quartiles, mean, NA count) of the numeric cabin number
summary(titanic$Location)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2.00   13.00   33.00   37.49   53.75   99.00    1027
# Bin the numeric cabin number into three zones:
#   below 14 -> "The Bow", 14 to 53 -> "The Middle", 54 to 99 -> "The Rear".
# Rows without a usable cabin number stay NA.
cabin_no <- titanic$Location
zone <- rep(NA_character_, length(cabin_no))
zone[which(cabin_no < 14)] <- "The Bow"
zone[which(cabin_no > 13 & cabin_no < 54)] <- "The Middle"
zone[which(cabin_no > 53 & cabin_no < 100)] <- "The Rear"

# Replace the number with the zone label and drop the raw Cabin column.
titanic$Location <- as.factor(zone)
titanic <- select(titanic, -one_of("Cabin"))

Let’s see if we made something meaningful

# Survival proportions per cabin zone (each row sums to 1)
prop.table(table(titanic$Location, titanic$Survived), margin = 1)
##             
##                      0         1
##   The Bow    0.3076923 0.6923077
##   The Middle 0.3052632 0.6947368
##   The Rear   0.3673469 0.6326531
# Survival proportions per deck (each row sums to 1)
prop.table(table(titanic$Deck, titanic$Survived), margin = 1)
##    
##             0         1
##     0.7001456 0.2998544
##   A 0.5333333 0.4666667
##   B 0.2553191 0.7446809
##   C 0.4067797 0.5932203
##   D 0.2424242 0.7575758
##   E 0.2500000 0.7500000
##   F 0.3846154 0.6153846
##   G 0.5000000 0.5000000
##   T 1.0000000 0.0000000

As expected, passengers in the rear part have slightly lower chances of survival. The central cabins seem not to be affected. Certain decks seem to have larger chances of survival; let’s hope it is not merely noise.

Also let’s create a new variable for children as it seems to be a good idea

# Fill missing ages with the mean of the observed ages (train and test rows
# together), then label everyone younger than 16 as a child.
# Note that every passenger whose age was imputed is labelled "Adult" whenever
# the mean age is 16 or more.
mean_age <- mean(titanic$Age, na.rm = TRUE)
titanic$Age <- replace(titanic$Age, is.na(titanic$Age), mean_age)
titanic$Age_cat <- as.factor(ifelse(titanic$Age < 16, "Child", "Adult"))

Let’s create a family size variable too

# Family size = the passenger + siblings/spouses + parents/children aboard
titanic$family_size <- 1 + titanic$SibSp + titanic$Parch
summary(titanic$family_size)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.884   2.000  11.000
# Treat family size as a categorical variable from here on
titanic$family_size <- as.factor(titanic$family_size)

Let’s see if we did something meaningful

# Survival proportions per family size (each row sums to 1)
prop.table(table(titanic$family_size, titanic$Survived), margin = 1)
##     
##              0         1
##   1  0.6964618 0.3035382
##   2  0.4472050 0.5527950
##   3  0.4215686 0.5784314
##   4  0.2758621 0.7241379
##   5  0.8000000 0.2000000
##   6  0.8636364 0.1363636
##   7  0.6666667 0.3333333
##   8  1.0000000 0.0000000
##   11 1.0000000 0.0000000
# Survival proportions for adults vs. children (each row sums to 1)
prop.table(table(titanic$Age_cat, titanic$Survived), margin = 1)
##        
##                 0         1
##   Adult 0.6373762 0.3626238
##   Child 0.4096386 0.5903614

Children have a better survival rate. When it comes to family size, large families are doing very badly, while couples alone or couples with a child (or another family member) seem to have the highest survival rates.

Now the dataset looks better but we have to deal with the problem of missing data

First let’s fill in the empty cells with NA values

# Disabled: was meant to turn empty-string / "NA" cells into real NA values.
# NOTE(review): the two tests are joined with `&`, which cannot be TRUE for a
# single cell; `|` was presumably intended. Left disabled as found.
#titanic[sapply(titanic, function(x) as.character(x)=="" & as.character(x)=="NA" )] <- NA
# Total number of NA cells in the whole data frame
sum(is.na(titanic))
## [1] 1446
# Number of NA values in each column. vapply() guarantees exactly one integer
# per column, whereas the return type of sapply() depends on its input.
vapply(titanic, function(column) sum(is.na(column)), integer(1))
## PassengerId    Survived      Pclass         Sex         Age       SibSp 
##           0         418           0           0           0           0 
##       Parch        Fare    Embarked        Deck    Location     Age_cat 
##           0           1           0           0        1027           0 
## family_size 
##           0
# Plot a map of missing vs. observed cells before the remaining imputation
missmap(titanic, main = "Missing values vs observed")

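We still have 1446 NA values: 418 in Survived (the test rows), 1027 in Location and 1 in Fare. (Blank Deck and Embarked entries are empty strings rather than NA, Age has already been imputed, and Cabin has been dropped.)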

How to deal with them?

# Fill missing Age and Fare values with the column mean (not the median).
for (num_col in c("Age", "Fare")) {
  values <- titanic[[num_col]]
  titanic[[num_col]] <- replace(
    values, is.na(values), mean(values, na.rm = TRUE)
  )
}

# Make sure every categorical column is stored as a factor.
for (cat_col in c("Sex", "Embarked", "Survived", "family_size")) {
  titanic[[cat_col]] <- as.factor(titanic[[cat_col]])
}
# Number of NA values in each column after imputation. vapply() guarantees
# exactly one integer per column, whereas the return type of sapply() depends
# on its input.
vapply(titanic, function(column) sum(is.na(column)), integer(1))
## PassengerId    Survived      Pclass         Sex         Age       SibSp 
##           0         418           0           0           0           0 
##       Parch        Fare    Embarked        Deck    Location     Age_cat 
##           0           0           0           0        1027           0 
## family_size 
##           0
# Plot the map of missing vs. observed cells again after imputation
missmap(titanic, main = "Missing values vs observed")

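From what I understood, the model will deal with the categorical variables on its own, as nobody on the web bothers to fill in their NAs with some sort of randomised distribution. (Caveat: glm() drops every row that still contains an NA by default, which is why the deviance tables further down start from only 195 residual degrees of freedom.)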

Now let’s take a look at our survival predictions:

# Keep every column except the row identifier for the per-variable survival
# tables. setdiff() on the column names simply ignores names that are no
# longer present (Name), so no "Unknown variables" warning is raised.
titanic_check <- titanic[, setdiff(names(titanic), c("PassengerId", "Name")),
                         drop = FALSE]
# For every column, tabulate the survival proportions per distinct value
# (each row sums to 1). The argument is deliberately named `x`: table() uses
# that symbol as the row-dimension label in the printed output.
sapply(
  titanic_check,
  function(x) prop.table(table(x, titanic_check$Survived), margin = 1)
)
## $Survived
##    
## x   0 1
##   0 1 0
##   1 0 1
## 
## $Pclass
##    
## x           0         1
##   1 0.3703704 0.6296296
##   2 0.5271739 0.4728261
##   3 0.7576375 0.2423625
## 
## $Sex
##         
## x                0         1
##   female 0.2579618 0.7420382
##   male   0.8110919 0.1889081
## 
## $Age
##                  
## x                         0         1
##   0.17                               
##   0.33                               
##   0.42            0.0000000 1.0000000
##   0.67            0.0000000 1.0000000
##   0.75            0.0000000 1.0000000
##   0.83            0.0000000 1.0000000
##   0.92            0.0000000 1.0000000
##   1               0.2857143 0.7142857
##   2               0.7000000 0.3000000
##   3               0.1666667 0.8333333
##   4               0.3000000 0.7000000
##   5               0.0000000 1.0000000
##   6               0.3333333 0.6666667
##   7               0.6666667 0.3333333
##   8               0.5000000 0.5000000
##   9               0.7500000 0.2500000
##   10              1.0000000 0.0000000
##   11              0.7500000 0.2500000
##   11.5                               
##   12              0.0000000 1.0000000
##   13              0.0000000 1.0000000
##   14              0.5000000 0.5000000
##   14.5            1.0000000 0.0000000
##   15              0.2000000 0.8000000
##   16              0.6470588 0.3529412
##   17              0.5384615 0.4615385
##   18              0.6538462 0.3461538
##   18.5                               
##   19              0.6400000 0.3600000
##   20              0.8000000 0.2000000
##   20.5            1.0000000 0.0000000
##   21              0.7916667 0.2083333
##   22              0.5925926 0.4074074
##   22.5                               
##   23              0.6666667 0.3333333
##   23.5            1.0000000 0.0000000
##   24              0.5000000 0.5000000
##   24.5            1.0000000 0.0000000
##   25              0.7391304 0.2608696
##   26              0.6666667 0.3333333
##   26.5                               
##   27              0.3888889 0.6111111
##   28              0.7200000 0.2800000
##   28.5            1.0000000 0.0000000
##   29              0.6000000 0.4000000
##   29.881137667304 0.7062147 0.2937853
##   30              0.6000000 0.4000000
##   30.5            1.0000000 0.0000000
##   31              0.5294118 0.4705882
##   32              0.5000000 0.5000000
##   32.5            0.5000000 0.5000000
##   33              0.6000000 0.4000000
##   34              0.6000000 0.4000000
##   34.5            1.0000000 0.0000000
##   35              0.3888889 0.6111111
##   36              0.5000000 0.5000000
##   36.5            1.0000000 0.0000000
##   37              0.8333333 0.1666667
##   38              0.5454545 0.4545455
##   38.5                               
##   39              0.6428571 0.3571429
##   40              0.5384615 0.4615385
##   40.5            1.0000000 0.0000000
##   41              0.6666667 0.3333333
##   42              0.5384615 0.4615385
##   43              0.8000000 0.2000000
##   44              0.6666667 0.3333333
##   45              0.5833333 0.4166667
##   45.5            1.0000000 0.0000000
##   46              1.0000000 0.0000000
##   47              0.8888889 0.1111111
##   48              0.3333333 0.6666667
##   49              0.3333333 0.6666667
##   50              0.5000000 0.5000000
##   51              0.7142857 0.2857143
##   52              0.5000000 0.5000000
##   53              0.0000000 1.0000000
##   54              0.6250000 0.3750000
##   55              0.5000000 0.5000000
##   55.5            1.0000000 0.0000000
##   56              0.5000000 0.5000000
##   57              1.0000000 0.0000000
##   58              0.4000000 0.6000000
##   59              1.0000000 0.0000000
##   60              0.5000000 0.5000000
##   60.5                               
##   61              1.0000000 0.0000000
##   62              0.5000000 0.5000000
##   63              0.0000000 1.0000000
##   64              1.0000000 0.0000000
##   65              1.0000000 0.0000000
##   66              1.0000000 0.0000000
##   67                                 
##   70              1.0000000 0.0000000
##   70.5            1.0000000 0.0000000
##   71              1.0000000 0.0000000
##   74              1.0000000 0.0000000
##   76                                 
##   80              0.0000000 1.0000000
## 
## $SibSp
##    
## x           0         1
##   0 0.6546053 0.3453947
##   1 0.4641148 0.5358852
##   2 0.5357143 0.4642857
##   3 0.7500000 0.2500000
##   4 0.8333333 0.1666667
##   5 1.0000000 0.0000000
##   8 1.0000000 0.0000000
## 
## $Parch
##    
## x           0         1
##   0 0.6563422 0.3436578
##   1 0.4491525 0.5508475
##   2 0.5000000 0.5000000
##   3 0.4000000 0.6000000
##   4 1.0000000 0.0000000
##   5 0.8000000 0.2000000
##   6 1.0000000 0.0000000
##   9                    
## 
## $Fare
##                   
## x                           0          1
##   0                0.93333333 0.06666667
##   3.1708                                
##   4.0125           1.00000000 0.00000000
##   5                1.00000000 0.00000000
##   6.2375           1.00000000 0.00000000
##   6.4375           1.00000000 0.00000000
##   6.45             1.00000000 0.00000000
##   6.4958           1.00000000 0.00000000
##   6.75             1.00000000 0.00000000
##   6.8583           1.00000000 0.00000000
##   6.95             1.00000000 0.00000000
##   6.975            0.50000000 0.50000000
##   7                                     
##   7.0458           1.00000000 0.00000000
##   7.05             1.00000000 0.00000000
##   7.0542           1.00000000 0.00000000
##   7.125            1.00000000 0.00000000
##   7.1417           0.00000000 1.00000000
##   7.225            0.75000000 0.25000000
##   7.2292           0.73333333 0.26666667
##   7.25             0.92307692 0.07692308
##   7.2833                                
##   7.3125           1.00000000 0.00000000
##   7.4958           0.66666667 0.33333333
##   7.5208           1.00000000 0.00000000
##   7.55             0.75000000 0.25000000
##   7.575                                 
##   7.5792                                
##   7.6292           1.00000000 0.00000000
##   7.65             0.75000000 0.25000000
##   7.7208                                
##   7.725            1.00000000 0.00000000
##   7.7292           1.00000000 0.00000000
##   7.7333           0.50000000 0.50000000
##   7.7375           0.50000000 0.50000000
##   7.7417           1.00000000 0.00000000
##   7.75             0.64705882 0.35294118
##   7.775            0.81250000 0.18750000
##   7.7792                                
##   7.7875           0.00000000 1.00000000
##   7.7958           0.66666667 0.33333333
##   7.8              1.00000000 0.00000000
##   7.8208                                
##   7.8292           0.50000000 0.50000000
##   7.85                                  
##   7.8542           0.76923077 0.23076923
##   7.875            1.00000000 0.00000000
##   7.8792           0.00000000 1.00000000
##   7.8875           1.00000000 0.00000000
##   7.8958           0.97368421 0.02631579
##   7.925            0.55555556 0.44444444
##   8.0292           0.00000000 1.00000000
##   8.05             0.88372093 0.11627907
##   8.1125           0.00000000 1.00000000
##   8.1375           1.00000000 0.00000000
##   8.1583           1.00000000 0.00000000
##   8.3              1.00000000 0.00000000
##   8.3625           1.00000000 0.00000000
##   8.4042           1.00000000 0.00000000
##   8.4333           1.00000000 0.00000000
##   8.4583           1.00000000 0.00000000
##   8.5167           0.00000000 1.00000000
##   8.6542           1.00000000 0.00000000
##   8.6625           0.92307692 0.07692308
##   8.6833           0.00000000 1.00000000
##   8.7125           1.00000000 0.00000000
##   8.85             1.00000000 0.00000000
##   8.9625                                
##   9                1.00000000 0.00000000
##   9.2167           1.00000000 0.00000000
##   9.225            1.00000000 0.00000000
##   9.325                                 
##   9.35             0.50000000 0.50000000
##   9.475            1.00000000 0.00000000
##   9.4833           1.00000000 0.00000000
##   9.5              0.77777778 0.22222222
##   9.5875           0.50000000 0.50000000
##   9.6875                                
##   9.825            1.00000000 0.00000000
##   9.8375           1.00000000 0.00000000
##   9.8417           0.00000000 1.00000000
##   9.8458           1.00000000 0.00000000
##   10.1708          1.00000000 0.00000000
##   10.4625          1.00000000 0.00000000
##   10.5             0.62500000 0.37500000
##   10.5167          1.00000000 0.00000000
##   10.7083                               
##   11.1333          0.00000000 1.00000000
##   11.2417          0.00000000 1.00000000
##   11.5             1.00000000 0.00000000
##   12               0.00000000 1.00000000
##   12.1833                               
##   12.275           1.00000000 0.00000000
##   12.2875          0.00000000 1.00000000
##   12.35            0.33333333 0.66666667
##   12.475           0.00000000 1.00000000
##   12.525           1.00000000 0.00000000
##   12.65            0.00000000 1.00000000
##   12.7375                               
##   12.875           1.00000000 0.00000000
##   13               0.61904762 0.38095238
##   13.4167          0.00000000 1.00000000
##   13.5             0.75000000 0.25000000
##   13.775                                
##   13.7917          0.00000000 1.00000000
##   13.8583          0.00000000 1.00000000
##   13.8625          0.00000000 1.00000000
##   13.9                                  
##   14               1.00000000 0.00000000
##   14.1083          1.00000000 0.00000000
##   14.4             1.00000000 0.00000000
##   14.4542          0.85714286 0.14285714
##   14.4583          1.00000000 0.00000000
##   14.5             0.71428571 0.28571429
##   15               1.00000000 0.00000000
##   15.0333                               
##   15.0458          1.00000000 0.00000000
##   15.05            1.00000000 0.00000000
##   15.1             1.00000000 0.00000000
##   15.2458          0.40000000 0.60000000
##   15.5             0.62500000 0.37500000
##   15.55            1.00000000 0.00000000
##   15.5792                               
##   15.7417          0.00000000 1.00000000
##   15.75            0.00000000 1.00000000
##   15.85            0.50000000 0.50000000
##   15.9             0.00000000 1.00000000
##   16               0.00000000 1.00000000
##   16.1             0.77777778 0.22222222
##   16.7             0.00000000 1.00000000
##   17.4             0.00000000 1.00000000
##   17.8             1.00000000 0.00000000
##   18               1.00000000 0.00000000
##   18.75            0.00000000 1.00000000
##   18.7875          0.50000000 0.50000000
##   19.2583          0.00000000 1.00000000
##   19.5             0.00000000 1.00000000
##   19.9667          1.00000000 0.00000000
##   20.2125          1.00000000 0.00000000
##   20.25            0.50000000 0.50000000
##   20.525           0.33333333 0.66666667
##   20.575           0.50000000 0.50000000
##   21               0.66666667 0.33333333
##   21.075           1.00000000 0.00000000
##   21.6792          1.00000000 0.00000000
##   22.025           0.00000000 1.00000000
##   22.3583          0.00000000 1.00000000
##   22.525           1.00000000 0.00000000
##   23               0.00000000 1.00000000
##   23.25            0.00000000 1.00000000
##   23.45            1.00000000 0.00000000
##   24               0.50000000 0.50000000
##   24.15            0.87500000 0.12500000
##   25.4667          1.00000000 0.00000000
##   25.5875          1.00000000 0.00000000
##   25.7                                  
##   25.7417                               
##   25.925           1.00000000 0.00000000
##   25.9292          0.00000000 1.00000000
##   26               0.51612903 0.48387097
##   26.25            0.33333333 0.66666667
##   26.2833          0.00000000 1.00000000
##   26.2875          0.00000000 1.00000000
##   26.3875          0.00000000 1.00000000
##   26.55            0.46666667 0.53333333
##   27               0.50000000 0.50000000
##   27.4458                               
##   27.7208          0.80000000 0.20000000
##   27.75            0.50000000 0.50000000
##   27.9             1.00000000 0.00000000
##   28.5             1.00000000 0.00000000
##   28.5375                               
##   28.7125          1.00000000 0.00000000
##   29               0.00000000 1.00000000
##   29.125           1.00000000 0.00000000
##   29.7             0.66666667 0.33333333
##   30               0.16666667 0.83333333
##   30.0708          0.50000000 0.50000000
##   30.5             0.20000000 0.80000000
##   30.6958          1.00000000 0.00000000
##   31               0.33333333 0.66666667
##   31.275           1.00000000 0.00000000
##   31.3875          0.25000000 0.75000000
##   31.5                                  
##   31.6792                               
##   31.6833                               
##   32.3208          1.00000000 0.00000000
##   32.5             0.00000000 1.00000000
##   33               0.33333333 0.66666667
##   33.2954792813456                      
##   33.5             1.00000000 0.00000000
##   34.0208          1.00000000 0.00000000
##   34.375           1.00000000 0.00000000
##   34.6542          1.00000000 0.00000000
##   35               1.00000000 0.00000000
##   35.5             0.25000000 0.75000000
##   36.75            0.50000000 0.50000000
##   37.0042          0.50000000 0.50000000
##   38.5             1.00000000 0.00000000
##   39               0.25000000 0.75000000
##   39.4             0.00000000 1.00000000
##   39.6             0.50000000 0.50000000
##   39.6875          1.00000000 0.00000000
##   40.125           1.00000000 0.00000000
##   41.5792          0.33333333 0.66666667
##   42.4             1.00000000 0.00000000
##   42.5                                  
##   45.5                                  
##   46.9             1.00000000 0.00000000
##   47.1             1.00000000 0.00000000
##   49.5             0.00000000 1.00000000
##   49.5042          0.50000000 0.50000000
##   50               1.00000000 0.00000000
##   50.4958          1.00000000 0.00000000
##   51.4792          0.00000000 1.00000000
##   51.8625          0.50000000 0.50000000
##   52               0.57142857 0.42857143
##   52.5542          0.00000000 1.00000000
##   53.1             0.40000000 0.60000000
##   55               0.00000000 1.00000000
##   55.4417          0.00000000 1.00000000
##   55.9             0.50000000 0.50000000
##   56.4958          0.28571429 0.71428571
##   56.9292          0.00000000 1.00000000
##   57               0.00000000 1.00000000
##   57.75                                 
##   57.9792          0.00000000 1.00000000
##   59.4             0.00000000 1.00000000
##   60                                    
##   61.175           1.00000000 0.00000000
##   61.3792          1.00000000 0.00000000
##   61.9792          1.00000000 0.00000000
##   63.3583          0.00000000 1.00000000
##   65               0.00000000 1.00000000
##   66.6             0.50000000 0.50000000
##   69.3             0.00000000 1.00000000
##   69.55            1.00000000 0.00000000
##   71               0.50000000 0.50000000
##   71.2833          0.00000000 1.00000000
##   73.5             1.00000000 0.00000000
##   75.2417                               
##   75.25            0.00000000 1.00000000
##   76.2917          0.00000000 1.00000000
##   76.7292          0.00000000 1.00000000
##   77.2875          1.00000000 0.00000000
##   77.9583          0.00000000 1.00000000
##   78.2667          0.00000000 1.00000000
##   78.85            0.50000000 0.50000000
##   79.2             0.50000000 0.50000000
##   79.65            0.33333333 0.66666667
##   80               0.00000000 1.00000000
##   81.8583          0.00000000 1.00000000
##   82.1708          0.50000000 0.50000000
##   82.2667                               
##   83.1583          0.00000000 1.00000000
##   83.475           0.50000000 0.50000000
##   86.5             0.00000000 1.00000000
##   89.1042          0.00000000 1.00000000
##   90               0.25000000 0.75000000
##   91.0792          0.00000000 1.00000000
##   93.5             0.00000000 1.00000000
##   106.425          0.50000000 0.50000000
##   108.9            0.50000000 0.50000000
##   110.8833         0.25000000 0.75000000
##   113.275          0.33333333 0.66666667
##   120              0.00000000 1.00000000
##   133.65           0.00000000 1.00000000
##   134.5            0.00000000 1.00000000
##   135.6333         0.33333333 0.66666667
##   136.7792                              
##   146.5208         0.00000000 1.00000000
##   151.55           0.50000000 0.50000000
##   153.4625         0.33333333 0.66666667
##   164.8667         0.00000000 1.00000000
##   211.3375         0.00000000 1.00000000
##   211.5            1.00000000 0.00000000
##   221.7792         1.00000000 0.00000000
##   227.525          0.25000000 0.75000000
##   247.5208         0.50000000 0.50000000
##   262.375          0.00000000 1.00000000
##   263              0.50000000 0.50000000
##   512.3292         0.00000000 1.00000000
## 
## $Embarked
##    
## x           0         1
##     0.0000000 1.0000000
##   C 0.4464286 0.5535714
##   Q 0.6103896 0.3896104
##   S 0.6630435 0.3369565
## 
## $Deck
##    
## x           0         1
##     0.7001456 0.2998544
##   A 0.5333333 0.4666667
##   B 0.2553191 0.7446809
##   C 0.4067797 0.5932203
##   D 0.2424242 0.7575758
##   E 0.2500000 0.7500000
##   F 0.3846154 0.6153846
##   G 0.5000000 0.5000000
##   T 1.0000000 0.0000000
## 
## $Location
##             
## x                    0         1
##   The Bow    0.3076923 0.6923077
##   The Middle 0.3052632 0.6947368
##   The Rear   0.3673469 0.6326531
## 
## $Age_cat
##        
## x               0         1
##   Adult 0.6373762 0.3626238
##   Child 0.4096386 0.5903614
## 
## $family_size
##     
## x            0         1
##   1  0.6964618 0.3035382
##   2  0.4472050 0.5527950
##   3  0.4215686 0.5784314
##   4  0.2758621 0.7241379
##   5  0.8000000 0.2000000
##   6  0.8636364 0.1363636
##   7  0.6666667 0.3333333
##   8  1.0000000 0.0000000
##   11 1.0000000 0.0000000

...sex and infancy seem to be the crucial predictors.

Looks promising

Finally, let’s split the data set again and standardise the numerics

# Split the combined frame back into its training and test rows. The split
# point comes from the loaded training file rather than hard-coded row numbers.
n_train <- nrow(titanic_train)
train <- as.data.frame(titanic[seq_len(n_train), ])
test <- as.data.frame(titanic[n_train + seq_len(nrow(titanic) - n_train), ])
test$Survived <- ""

# Learn the centring/scaling parameters from the training rows only and apply
# them to the training set.
titanic_pp <- preProcess(train, method = c("center", "scale"))
titanic_modelling <- predict(titanic_pp, newdata = train)

# Apply the same transformation to the test rows so that a model fitted on
# titanic_modelling can be given identically scaled predictors. `test` itself
# is left untransformed (its PassengerId stays in original units).
test_modelling <- predict(titanic_pp, newdata = test)
#attach(titanic_modelling)
  4. Construct a model

Let’s start with logistic regression model

# Logistic regression of survival on the engineered predictors.
# PassengerId is only a row identifier, so it is excluded from the formula.
# NOTE(review): glm() drops rows with NA in any model variable by default;
# with Location missing for most passengers the fit uses only a small part of
# the training rows.
# NOTE: model output printed in this document was produced before
# PassengerId was excluded and needs to be regenerated.
logistic.regression.model <- glm(
  Survived ~ . - PassengerId,
  family = binomial(link = "logit"),
  data = titanic_modelling
)

Let’s test the model

# Analysis of deviance with terms added sequentially, chi-squared test per term
anova(logistic.regression.model, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Survived
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                          195     246.15              
## PassengerId  1    2.692       194     243.46  0.100840    
## Pclass       1    0.338       193     243.12  0.560997    
## Sex          1   62.216       192     180.91 3.078e-15 ***
## Age          1    7.214       191     173.69  0.007232 ** 
## SibSp        1    0.275       190     173.42  0.599673    
## Parch        1    0.990       189     172.43  0.319715    
## Fare         1    0.279       188     172.15  0.597119    
## Embarked     3    3.328       185     168.82  0.343697    
## Deck         6   13.106       179     155.71  0.041388 *  
## Location     2    2.397       177     153.32  0.301660    
## Age_cat      1    0.655       176     152.66  0.418475    
## family_size  4    2.941       172     149.72  0.567809    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Importance score for each model coefficient as reported by varImp()
varImp(logistic.regression.model)
##                        Overall
## PassengerId        0.970350559
## Pclass             0.617616258
## Sexmale            5.661213193
## Age                2.275369797
## SibSp              0.485112383
## Parch              1.642091028
## Fare               0.812727039
## EmbarkedC          0.007803637
## EmbarkedQ          0.009226067
## EmbarkedS          0.008288520
## DeckB              0.063501720
## DeckC              0.603912015
## DeckD              0.499807776
## DeckE              0.953100938
## DeckF              0.446196542
## DeckG              2.160011337
## LocationThe Middle 0.307049564
## LocationThe Rear   1.646764750
## Age_catChild       0.651022534
## family_size2       0.501579984
## family_size3       1.594096181
## family_size4       0.582385628
## family_size5       0.008161283

Several variables have very low deviance - let’s get rid of them: SibSp, Fare, Embarked, Parch. I assume there might be a multicollinearity problem, so let’s get rid of Age too.

# Reduced modelling frame: the low-deviance predictors and Age are removed.
tm <- select(
  titanic_modelling,
  -one_of("SibSp", "Fare", "Embarked", "Age", "Parch")
)

And try one more time

# Refit the logistic regression on the reduced frame.
# PassengerId is only a row identifier, so it is excluded from the formula.
# NOTE: model output printed in this document was produced before
# PassengerId was excluded and needs to be regenerated.
logistic.regression.model.smaller <- glm(
  Survived ~ . - PassengerId,
  family = binomial(link = "logit"),
  data = tm
)
# Analysis of deviance with terms added sequentially, chi-squared test per term
anova(logistic.regression.model.smaller, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Survived
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                          195     246.15              
## PassengerId  1    2.692       194     243.46   0.10084    
## Pclass       1    0.338       193     243.12   0.56100    
## Sex          1   62.216       192     180.91 3.078e-15 ***
## Deck         6   10.061       186     170.85   0.12210    
## Location     2    1.864       184     168.98   0.39367    
## Age_cat      1    4.359       183     164.62   0.03681 *  
## family_size  5    2.267       178     162.35   0.81104    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Importance score for each coefficient of the reduced model
varImp(logistic.regression.model.smaller)
##                       Overall
## PassengerId        1.46723017
## Pclass             0.39644876
## Sexmale            5.98112715
## DeckB              0.19257967
## DeckC              0.49529588
## DeckD              0.38152506
## DeckE              0.58664719
## DeckF              0.44579884
## DeckG              2.16893780
## LocationThe Middle 0.50937648
## LocationThe Rear   1.45294779
## Age_catChild       1.69465374
## family_size2       0.96109478
## family_size3       0.88469921
## family_size4       0.33283567
## family_size5       0.01266958
## family_size6       0.39541062

Cross-validation

# Resampling scheme: 5-fold cross-validation, repeated 5 times
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 5)

This part used to work before I updated R; now I am even more clueless.

# gradient boosting
#gbmFit1 <- train(Survived ~ ., data = tm, 
#                 method = "gbm", 
#                 trControl = fitControl,
#                 verbose = FALSE)

# random forest
#rfFit1 <- train(Survived ~ ., data = tm,
#                method = "rf",
#                trControl = fitControl)

# neural network
#nnFit1 <- train(Survived ~ ., data = tm,
#                method = "avNNet", trControl = fitControl, linout = TRUE)
#summary(gbmFit1)
#plot(gbmFit1)
#summary(rfFit1)
#plot(rfFit1)
#summary(nnFit1)
#plot(nnFit1)
  5. Fit the model
#log.predictions <- predict(logistic.regression.model, newdata = test)
#log.predictions.smaller <- predict(logistic.regression.model.smaller, newdata = test)
#gbm.predictions <- predict(gbmFit1, newdata = test)
#rf.predictions <- predict(rfFit1, newdata = test)
#nnet.predictions <- predict(nnFit1, newdata = test)
  6. Prediction (confusion matrix, ROC, AUC)
#predicted values for traindata:
#confusionMatrix(gbm.predictions,test$Survived)
#confusionMatrix(rf.predictions,test$Survived)
#confusionMatrix(nnet.predictions,test$Survived)

I was experiencing problems with the confusion matrix, ROC curve etc. I guess I did something wrong, but I was not able to find it.

#solution1 <- data.frame(PassengerID = test$PassengerId, Survived = gbm.predictions)
#solution2 <- data.frame(PassengerID = test$PassengerId, Survived = rf.predictions)
#solution3 <- data.frame(PassengerID = test$PassengerId, Survived = nnet.predictions)

# Write the solution to file
#write.csv(solution1, file = 'gbm_mod_Solution.csv', row.names = F)
#write.csv(solution2, file = 'rf_mod_Solution.csv', row.names = F)
#write.csv(solution3, file = 'nnet_mod_Solution.csv', row.names = F)