1. Using the packages

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(Hmisc)

## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(rms)

## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:base':
## 
##     backsolve

library(mice)

## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind

2. Importing data

## Getting the data from Hmisc and rms libraries getHdata(titanic3) 
## assign it to a variable called “titanic” titanic<-titanic3
getHdata(titanic3) 
titanic <- titanic3

# Exploration the data
glimpse(titanic)

## Rows: 1,309
## Columns: 14
## $ pclass    <fct> 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, …
## $ survived  <labelled> 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1…
## $ name      <labelled> "Allen, Miss. Elisabeth Walton", "Allison, Master. Huds…
## $ sex       <fct> female, male, female, male, female, male, female, male, fema…
## $ age       <labelled> 29.0000, 0.9167, 2.0000, 30.0000, 25.0000, 48.0000, 63.…
## $ sibsp     <labelled> 0, 1, 1, 1, 1, 0, 1, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ parch     <labelled> 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0…
## $ ticket    <labelled> "24160", "113781", "113781", "113781", "113781", "19952…
## $ fare      <labelled> 211.3375, 151.5500, 151.5500, 151.5500, 151.5500, 26.55…
## $ cabin     <fct> B5, C22 C26, C22 C26, C22 C26, C22 C26, E12, D7, A36, C101, …
## $ embarked  <fct> Southampton, Southampton, Southampton, Southampton, Southamp…
## $ boat      <fct> 2, 11, , , , 3, 10, , D, , , 4, 9, 6, B, , , 6, 8, A, 5, 5, …
## $ body      <labelled> NA, NA, NA, 135, NA, NA, NA, NA, NA, 22, 124, NA, NA, N…
## $ home.dest <labelled> "St Louis, MO", "Montreal, PQ / Chesterville, ON", "Mon…

dim(titanic)

## [1] 1309   14

head(titanic, 10)

##    pclass survived                            name    sex     age sibsp parch
## 1     1st        1   Allen, Miss. Elisabeth Walton female 29.0000     0     0
## 2     1st        1  Allison, Master. Hudson Trevor   male  0.9167     1     2
## 3     1st        0    Allison, Miss. Helen Loraine female  2.0000     1     2
## 4     1st        0 Allison, Mr. Hudson Joshua Crei   male 30.0000     1     2
## 5     1st        0 Allison, Mrs. Hudson J C (Bessi female 25.0000     1     2
## 6     1st        1             Anderson, Mr. Harry   male 48.0000     0     0
## 7     1st        1 Andrews, Miss. Kornelia Theodos female 63.0000     1     0
## 8     1st        0          Andrews, Mr. Thomas Jr   male 39.0000     0     0
## 9     1st        1 Appleton, Mrs. Edward Dale (Cha female 53.0000     2     0
## 10    1st        0         Artagaveytia, Mr. Ramon   male 71.0000     0     0
##      ticket     fare   cabin    embarked boat body
## 1     24160 211.3375      B5 Southampton    2   NA
## 2    113781 151.5500 C22 C26 Southampton   11   NA
## 3    113781 151.5500 C22 C26 Southampton        NA
## 4    113781 151.5500 C22 C26 Southampton       135
## 5    113781 151.5500 C22 C26 Southampton        NA
## 6     19952  26.5500     E12 Southampton    3   NA
## 7     13502  77.9583      D7 Southampton   10   NA
## 8    112050   0.0000     A36 Southampton        NA
## 9     11769  51.4792    C101 Southampton    D   NA
## 10 PC 17609  49.5042           Cherbourg        22
##                          home.dest
## 1                     St Louis, MO
## 2  Montreal, PQ / Chesterville, ON
## 3  Montreal, PQ / Chesterville, ON
## 4  Montreal, PQ / Chesterville, ON
## 5  Montreal, PQ / Chesterville, ON
## 6                     New York, NY
## 7                       Hudson, NY
## 8                      Belfast, NI
## 9              Bayside, Queens, NY
## 10             Montevideo, Uruguay

tail(titanic, 10)

##      pclass survived                            name    sex  age sibsp parch
## 1300    3rd        0             Yasbeck, Mr. Antoni   male 27.0     1     0
## 1301    3rd        1 Yasbeck, Mrs. Antoni (Selini Al female 15.0     1     0
## 1302    3rd        0            Youseff, Mr. Gerious   male 45.5     0     0
## 1303    3rd        0               Yousif, Mr. Wazli   male   NA     0     0
## 1304    3rd        0           Yousseff, Mr. Gerious   male   NA     0     0
## 1305    3rd        0            Zabour, Miss. Hileni female 14.5     1     0
## 1306    3rd        0           Zabour, Miss. Thamine female   NA     1     0
## 1307    3rd        0       Zakarian, Mr. Mapriededer   male 26.5     0     0
## 1308    3rd        0             Zakarian, Mr. Ortin   male 27.0     0     0
## 1309    3rd        0              Zimmerman, Mr. Leo   male 29.0     0     0
##      ticket    fare cabin    embarked boat body home.dest
## 1300   2659 14.4542         Cherbourg    C   NA          
## 1301   2659 14.4542         Cherbourg        NA          
## 1302   2628  7.2250         Cherbourg       312          
## 1303   2647  7.2250         Cherbourg        NA          
## 1304   2627 14.4583         Cherbourg        NA          
## 1305   2665 14.4542         Cherbourg       328          
## 1306   2665 14.4542         Cherbourg        NA          
## 1307   2656  7.2250         Cherbourg       304          
## 1308   2670  7.2250         Cherbourg        NA          
## 1309 315082  7.8750       Southampton        NA

str(titanic)

## 'data.frame':    1309 obs. of  14 variables:
##  $ pclass   : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
##  $ survived : 'labelled' int  1 1 0 0 0 1 1 0 1 0 ...
##   ..- attr(*, "label")= chr "Survived"
##  $ name     : 'labelled' chr  "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Crei" ...
##   ..- attr(*, "label")= chr "Name"
##  $ sex      : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
##  $ age      : 'labelled' num  29 0.917 2 30 25 ...
##   ..- attr(*, "units")= chr "Year"
##   ..- attr(*, "label")= chr "Age"
##  $ sibsp    : 'labelled' int  0 1 1 1 1 0 1 0 2 0 ...
##   ..- attr(*, "label")= chr "Number of Siblings/Spouses Aboard"
##  $ parch    : 'labelled' int  0 2 2 2 2 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "Number of Parents/Children Aboard"
##  $ ticket   : 'labelled' chr  "24160" "113781" "113781" "113781" ...
##   ..- attr(*, "label")= chr "Ticket Number"
##  $ fare     : 'labelled' num  211 152 152 152 152 ...
##   ..- attr(*, "units")= chr "British Pound (\\243)"
##   ..- attr(*, "label")= chr "Passenger Fare"
##  $ cabin    : Factor w/ 187 levels "","A10","A11",..: 45 81 81 81 81 151 147 17 63 1 ...
##  $ embarked : Factor w/ 3 levels "Cherbourg","Queenstown",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ boat     : Factor w/ 28 levels "","1","10","11",..: 13 4 1 1 1 14 3 1 28 1 ...
##  $ body     : 'labelled' int  NA NA NA 135 NA NA NA NA NA 22 ...
##   ..- attr(*, "label")= chr "Body Identification Number"
##  $ home.dest: 'labelled' chr  "St Louis, MO" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" ...
##   ..- attr(*, "label")= chr "Home/Destination"

2.1 Select age, sex, class and fare that we will use for analysis

titanic2 <- titanic %>% select(age, sex, ticket, fare, pclass, sibsp, parch, survived)

2.2 Summary and Description of the data

summary(titanic2)

##       age              sex         ticket               fare         pclass   
##  Min.   : 0.1667   female:466   Length:1309        Min.   :  0.000   1st:323  
##  1st Qu.:21.0000   male  :843   Class :labelled    1st Qu.:  7.896   2nd:277  
##  Median :28.0000                Mode  :character   Median : 14.454   3rd:709  
##  Mean   :29.8811                                   Mean   : 33.295            
##  3rd Qu.:39.0000                                   3rd Qu.: 31.275            
##  Max.   :80.0000                                   Max.   :512.329            
##  NA's   :263                                       NA's   :1                  
##      sibsp            parch          survived    
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.000  
##  Median :0.0000   Median :0.000   Median :0.000  
##  Mean   :0.4989   Mean   :0.385   Mean   :0.382  
##  3rd Qu.:1.0000   3rd Qu.:0.000   3rd Qu.:1.000  
##  Max.   :8.0000   Max.   :9.000   Max.   :1.000  
##

describe(titanic2)

## titanic2 
## 
##  8  Variables      1309  Observations
## --------------------------------------------------------------------------------
## age : Age [Year] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1046      263       98    0.999    29.88    16.06        5       14 
##      .25      .50      .75      .90      .95 
##       21       28       39       50       57 
## 
## lowest :  0.1667  0.3333  0.4167  0.6667  0.7500
## highest: 70.5000 71.0000 74.0000 76.0000 80.0000
## --------------------------------------------------------------------------------
## sex 
##        n  missing distinct 
##     1309        0        2 
##                         
## Value      female   male
## Frequency     466    843
## Proportion  0.356  0.644
## --------------------------------------------------------------------------------
## ticket : Ticket Number 
##        n  missing distinct 
##     1309        0      929 
## 
## lowest : 110152      110413      110465      110469      110489     
## highest: W./C. 6608  W./C. 6609  W.E.P. 5734 W/C 14208   WE/P 5735  
## --------------------------------------------------------------------------------
## fare : Passenger Fare [British Pound (\243)] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1308        1      281        1     33.3    38.61    7.225    7.567 
##      .25      .50      .75      .90      .95 
##    7.896   14.454   31.275   78.051  133.650 
## 
## lowest :   0.0000   3.1708   4.0125   5.0000   6.2375
## highest: 227.5250 247.5208 262.3750 263.0000 512.3292
## --------------------------------------------------------------------------------
## pclass 
##        n  missing distinct 
##     1309        0        3 
##                             
## Value        1st   2nd   3rd
## Frequency    323   277   709
## Proportion 0.247 0.212 0.542
## --------------------------------------------------------------------------------
## sibsp : Number of Siblings/Spouses Aboard 
##        n  missing distinct     Info     Mean      Gmd 
##     1309        0        7     0.67   0.4989    0.777 
## 
## lowest : 0 1 2 3 4, highest: 2 3 4 5 8
##                                                     
## Value          0     1     2     3     4     5     8
## Frequency    891   319    42    20    22     6     9
## Proportion 0.681 0.244 0.032 0.015 0.017 0.005 0.007
## --------------------------------------------------------------------------------
## parch : Number of Parents/Children Aboard 
##        n  missing distinct     Info     Mean      Gmd 
##     1309        0        8    0.549    0.385   0.6375 
## 
## lowest : 0 1 2 3 4, highest: 3 4 5 6 9
##                                                           
## Value          0     1     2     3     4     5     6     9
## Frequency   1002   170   113     8     6     6     2     2
## Proportion 0.765 0.130 0.086 0.006 0.005 0.005 0.002 0.002
## --------------------------------------------------------------------------------
## survived : Survived 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1309        0        2    0.708      500    0.382   0.4725 
## 
## --------------------------------------------------------------------------------

2.3 Check the survival marginal distributions

ggplot(titanic, aes(as.factor(survived), fill = as.factor(survived))) +
  geom_bar(width = 0.6) +
  theme(legend.position = "none") +
  scale_x_discrete(labels = c ("No", "Yes")) +
  xlab("survived") + 
  ggtitle("The survival distributions")

2.4 The survival marginal according to gender

ggplot(data = titanic, mapping = aes(x = sex, fill = as.factor(survived))) +
  geom_bar(width = 0.6) +
  guides(fill = guide_legend(title = "survived")) +
  ggtitle("The survival marginal distribution according to gender")

2.5 The survival marginal distributions according to class

ggplot(data = titanic, mapping = aes(x = pclass, fill = as.factor(survived))) +
  geom_bar(width = 0.6) +
  guides(fill = guide_legend(title = "survived")) +
  ggtitle("The survival marginal distritution according to class")

2.6 Investigate trends

ggplot(titanic,aes(x=age,y=survived)) +
  geom_smooth(method="loess") +
  ylim (0,1) # it is obvious that there is not a linear relationship

## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 263 rows containing non-finite values (`stat_smooth()`).

2.7 Does sex make the effect of age more obvious?

ggplot(titanic,aes(x=age,y=survived,col=sex)) +
  geom_smooth(method="loess") +
  ylim (0,1)

## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 263 rows containing non-finite values (`stat_smooth()`).

The plot is showing the relationship between age and survival, grouped by sex. The trendline shows the general relationship between age and survival, with higher survival rates among younger passengers and lower survival rates among older passengers. The trendline is different for males and females, which suggests that the relationship between age and survival varies by sex.

When examining each sex separately, the effect of age is more noticeable. Specifically, older females are more likely to survive than younger females, whereas the opposite is true for males.

2.8 The relationship between age and survival, grouped by passenger class

ggplot(titanic, aes(x = age, y = survived, col = pclass)) + geom_smooth(method = "loess") + ylim (0,1)

## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 263 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 5 rows containing missing values (`geom_smooth()`).

2.9 Split the passenger class and age groups by gender

ggplot(titanic, aes(x = age, y = survived, col = pclass)) + facet_grid(~sex) + geom_smooth(method = "loess") + ylim (0,1)

## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 263 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 23 rows containing missing values (`geom_smooth()`).

# --> face_grid: this function used to create separate plots for each combination of sex and pclass

2.10 The impact of fare

ggplot(titanic, aes(x = fare, y = survived)) + geom_smooth(method = "loess") + ylim(0,1)

## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).

# --> positive impact on the survival

2.11 Histogram

ggplot(data = titanic, mapping = aes(fare, fill = fare)) + geom_histogram()

## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).

## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

ggplot(titanic,aes(fare)) + geom_histogram() + facet_grid(~pclass)

## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).

3. Checking for missings;

colSums(is.na(titanic))

##    pclass  survived      name       sex       age     sibsp     parch    ticket 
##         0         0         0         0       263         0         0         0 
##      fare     cabin  embarked      boat      body home.dest 
##         1         0         2         0      1188         0

md.pattern(titanic)

##     pclass survived name sex sibsp parch ticket cabin boat home.dest fare
## 119      1        1    1   1     1     1      1     1    1         1    1
## 924      1        1    1   1     1     1      1     1    1         1    1
## 1        1        1    1   1     1     1      1     1    1         1    1
## 262      1        1    1   1     1     1      1     1    1         1    1
## 2        1        1    1   1     1     1      1     1    1         1    1
## 1        1        1    1   1     1     1      1     1    1         1    0
##          0        0    0   0     0     0      0     0    0         0    1
##     embarked age body     
## 119        1   1    1    0
## 924        1   1    0    1
## 1          1   0    1    1
## 262        1   0    0    2
## 2          0   1    0    2
## 1          1   1    1    1
##            2 263 1188 1454

md.pattern(titanic2)

##      sex ticket pclass sibsp parch survived fare age    
## 1045   1      1      1     1     1        1    1   1   0
## 263    1      1      1     1     1        1    1   0   1
## 1      1      1      1     1     1        1    0   1   1
##        0      0      0     0     0        0    1 263 264

3.1 Let’s look at the observation of the one missing value on fare.

titanic[which(is.na(titanic$fare)),]

##      pclass survived               name  sex  age sibsp parch ticket fare cabin
## 1226    3rd        0 Storey, Mr. Thomas male 60.5     0     0   3701   NA      
##         embarked boat body home.dest
## 1226 Southampton       261

titanic2[which(is.na(titanic2$fare)),]

##       age  sex ticket fare pclass sibsp parch survived
## 1226 60.5 male   3701   NA    3rd     0     0        0

summary(aov(fare~pclass,data=titanic)) # --> It is statistically significant that the means of the fares differ amongst the classes.

##               Df  Sum Sq Mean Sq F value Pr(>F)    
## pclass         2 1272986  636493   372.7 <2e-16 ***
## Residuals   1305 2228415    1708                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness

TukeyHSD(aov(fare~pclass,data=titanic)) # --> Tukey test to see where the difference is

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = fare ~ pclass, data = titanic)
## 
## $pclass
##               diff       lwr        upr     p adj
## 2nd-1st -66.329795 -74.26990 -58.389693 0.0000000
## 3rd-1st -74.206103 -80.71643 -67.695772 0.0000000
## 3rd-2nd  -7.876308 -14.74783  -1.004781 0.0198199

# The difference between each class is statistically significant.

3.2 Missing value on age variable

summary(is.na(age) ~ survived, data = titanic)

## is.na(age)      N= 1309  
## 
## +--------+---+----+----------+
## |        |   |   N|is.na(age)|
## +--------+---+----+----------+
## |Survived| No| 809| 0.2348578|
## |        |Yes| 500| 0.1460000|
## +--------+---+----+----------+
## | Overall|   |1309| 0.2009167|
## +--------+---+----+----------+

age.na<-glm(is.na(age)~sex+pclass+survived+parch+sibsp,data=titanic)

summary(age.na)

## 
## Call:
## glm(formula = is.na(age) ~ sex + pclass + survived + parch + 
##     sibsp, data = titanic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3222  -0.2845  -0.1228  -0.0410   1.0389  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.157060   0.036134   4.347 1.49e-05 ***
## sexmale     -0.002480   0.026931  -0.092  0.92665    
## pclass2nd   -0.069131   0.031976  -2.162  0.03080 *  
## pclass3rd    0.161677   0.027340   5.914 4.27e-09 ***
## survived    -0.034264   0.027185  -1.260  0.20775    
## parch       -0.039656   0.013551  -2.927  0.00349 ** 
## sibsp        0.001743   0.011134   0.157  0.87561    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1493451)
## 
##     Null deviance: 210.16  on 1308  degrees of freedom
## Residual deviance: 194.45  on 1302  degrees of freedom
## AIC: 1234.7
## 
## Number of Fisher Scoring iterations: 2

4. Splitting data

## Make it reproducible
set.seed(13032019)
#use 80% of dataset as training set and 20% as test set
sample <- sample(c(TRUE, FALSE), nrow(titanic2), replace=TRUE, prob=c(0.8,0.2))
train.titanic  <- titanic2[sample, ]
test.titanic   <- titanic2[!sample, ]

md.pattern(test.titanic)

##     sex ticket fare pclass sibsp parch survived age   
## 210   1      1    1      1     1     1        1   1  0
## 53    1      1    1      1     1     1        1   0  1
##       0      0    0      0     0     0        0  53 53

md.pattern(train.titanic)

##     sex ticket pclass sibsp parch survived fare age    
## 835   1      1      1     1     1        1    1   1   0
## 210   1      1      1     1     1        1    1   0   1
## 1     1      1      1     1     1        1    0   1   1
##       0      0      0     0     0        0    1 210 211

4.1 imputing age into the training set

set.seed(13032019)

imputed_data<-mice(test.titanic,m=5,maxit = 50,method = 'rf')

## 
##  iter imp variable
##   1   1  age
##   1   2  age
##   1   3  age
##   1   4  age
##   1   5  age
##   2   1  age
##   2   2  age
##   2   3  age
##   2   4  age
##   2   5  age
##   3   1  age
##   3   2  age
##   3   3  age
##   3   4  age
##   3   5  age
##   4   1  age
##   4   2  age
##   4   3  age
##   4   4  age
##   4   5  age
##   5   1  age
##   5   2  age
##   5   3  age
##   5   4  age
##   5   5  age
##   6   1  age
##   6   2  age
##   6   3  age
##   6   4  age
##   6   5  age
##   7   1  age
##   7   2  age
##   7   3  age
##   7   4  age
##   7   5  age
##   8   1  age
##   8   2  age
##   8   3  age
##   8   4  age
##   8   5  age
##   9   1  age
##   9   2  age
##   9   3  age
##   9   4  age
##   9   5  age
##   10   1  age
##   10   2  age
##   10   3  age
##   10   4  age
##   10   5  age
##   11   1  age
##   11   2  age
##   11   3  age
##   11   4  age
##   11   5  age
##   12   1  age
##   12   2  age
##   12   3  age
##   12   4  age
##   12   5  age
##   13   1  age
##   13   2  age
##   13   3  age
##   13   4  age
##   13   5  age
##   14   1  age
##   14   2  age
##   14   3  age
##   14   4  age
##   14   5  age
##   15   1  age
##   15   2  age
##   15   3  age
##   15   4  age
##   15   5  age
##   16   1  age
##   16   2  age
##   16   3  age
##   16   4  age
##   16   5  age
##   17   1  age
##   17   2  age
##   17   3  age
##   17   4  age
##   17   5  age
##   18   1  age
##   18   2  age
##   18   3  age
##   18   4  age
##   18   5  age
##   19   1  age
##   19   2  age
##   19   3  age
##   19   4  age
##   19   5  age
##   20   1  age
##   20   2  age
##   20   3  age
##   20   4  age
##   20   5  age
##   21   1  age
##   21   2  age
##   21   3  age
##   21   4  age
##   21   5  age
##   22   1  age
##   22   2  age
##   22   3  age
##   22   4  age
##   22   5  age
##   23   1  age
##   23   2  age
##   23   3  age
##   23   4  age
##   23   5  age
##   24   1  age
##   24   2  age
##   24   3  age
##   24   4  age
##   24   5  age
##   25   1  age
##   25   2  age
##   25   3  age
##   25   4  age
##   25   5  age
##   26   1  age
##   26   2  age
##   26   3  age
##   26   4  age
##   26   5  age
##   27   1  age
##   27   2  age
##   27   3  age
##   27   4  age
##   27   5  age
##   28   1  age
##   28   2  age
##   28   3  age
##   28   4  age
##   28   5  age
##   29   1  age
##   29   2  age
##   29   3  age
##   29   4  age
##   29   5  age
##   30   1  age
##   30   2  age
##   30   3  age
##   30   4  age
##   30   5  age
##   31   1  age
##   31   2  age
##   31   3  age
##   31   4  age
##   31   5  age
##   32   1  age
##   32   2  age
##   32   3  age
##   32   4  age
##   32   5  age
##   33   1  age
##   33   2  age
##   33   3  age
##   33   4  age
##   33   5  age
##   34   1  age
##   34   2  age
##   34   3  age
##   34   4  age
##   34   5  age
##   35   1  age
##   35   2  age
##   35   3  age
##   35   4  age
##   35   5  age
##   36   1  age
##   36   2  age
##   36   3  age
##   36   4  age
##   36   5  age
##   37   1  age
##   37   2  age
##   37   3  age
##   37   4  age
##   37   5  age
##   38   1  age
##   38   2  age
##   38   3  age
##   38   4  age
##   38   5  age
##   39   1  age
##   39   2  age
##   39   3  age
##   39   4  age
##   39   5  age
##   40   1  age
##   40   2  age
##   40   3  age
##   40   4  age
##   40   5  age
##   41   1  age
##   41   2  age
##   41   3  age
##   41   4  age
##   41   5  age
##   42   1  age
##   42   2  age
##   42   3  age
##   42   4  age
##   42   5  age
##   43   1  age
##   43   2  age
##   43   3  age
##   43   4  age
##   43   5  age
##   44   1  age
##   44   2  age
##   44   3  age
##   44   4  age
##   44   5  age
##   45   1  age
##   45   2  age
##   45   3  age
##   45   4  age
##   45   5  age
##   46   1  age
##   46   2  age
##   46   3  age
##   46   4  age
##   46   5  age
##   47   1  age
##   47   2  age
##   47   3  age
##   47   4  age
##   47   5  age
##   48   1  age
##   48   2  age
##   48   3  age
##   48   4  age
##   48   5  age
##   49   1  age
##   49   2  age
##   49   3  age
##   49   4  age
##   49   5  age
##   50   1  age
##   50   2  age
##   50   3  age
##   50   4  age
##   50   5  age

## Warning: Number of logged events: 1

mean(titanic$age,na.rm = TRUE)     ##the original data's mean value

## [1] 29.88113

lapply(imputed_data$imp$age,mean)  ##each imputed set's mean

## $`1`
## [1] 24.62264
## 
## $`2`
## [1] 29
## 
## $`3`
## [1] 30.32075
## 
## $`4`
## [1] 25.5
## 
## $`5`
## [1] 25.5566

sd(titanic$age,na.rm = TRUE)     ##the original data's standard deviation value

## [1] 14.4135

lapply(imputed_data$imp$age,sd)  ##each imputed set's standard deviation

## $`1`
## [1] 8.792428
## 
## $`2`
## [1] 12.72868
## 
## $`3`
## [1] 13.86962
## 
## $`4`
## [1] 10.80064
## 
## $`5`
## [1] 9.718965

# So we will choose the 2nd set
titanic.train.complete<-complete(imputed_data, 3)

##assign the survived variable from the test.titanic to to "actual"
actual <- test.titanic$survived
##drop it from the dataset
test.titanic <- select(test.titanic,-survived)

4.2 imputing age into the testing data

set.seed(13032019)
imputed_data.test<-mice(test.titanic,m=5,maxit = 50,method = 'rf')

## 
##  iter imp variable
##   1   1  age
##   1   2  age
##   1   3  age
##   1   4  age
##   1   5  age
##   2   1  age
##   2   2  age
##   2   3  age
##   2   4  age
##   2   5  age
##   3   1  age
##   3   2  age
##   3   3  age
##   3   4  age
##   3   5  age
##   4   1  age
##   4   2  age
##   4   3  age
##   4   4  age
##   4   5  age
##   5   1  age
##   5   2  age
##   5   3  age
##   5   4  age
##   5   5  age
##   6   1  age
##   6   2  age
##   6   3  age
##   6   4  age
##   6   5  age
##   7   1  age
##   7   2  age
##   7   3  age
##   7   4  age
##   7   5  age
##   8   1  age
##   8   2  age
##   8   3  age
##   8   4  age
##   8   5  age
##   9   1  age
##   9   2  age
##   9   3  age
##   9   4  age
##   9   5  age
##   10   1  age
##   10   2  age
##   10   3  age
##   10   4  age
##   10   5  age
##   11   1  age
##   11   2  age
##   11   3  age
##   11   4  age
##   11   5  age
##   12   1  age
##   12   2  age
##   12   3  age
##   12   4  age
##   12   5  age
##   13   1  age
##   13   2  age
##   13   3  age
##   13   4  age
##   13   5  age
##   14   1  age
##   14   2  age
##   14   3  age
##   14   4  age
##   14   5  age
##   15   1  age
##   15   2  age
##   15   3  age
##   15   4  age
##   15   5  age
##   16   1  age
##   16   2  age
##   16   3  age
##   16   4  age
##   16   5  age
##   17   1  age
##   17   2  age
##   17   3  age
##   17   4  age
##   17   5  age
##   18   1  age
##   18   2  age
##   18   3  age
##   18   4  age
##   18   5  age
##   19   1  age
##   19   2  age
##   19   3  age
##   19   4  age
##   19   5  age
##   20   1  age
##   20   2  age
##   20   3  age
##   20   4  age
##   20   5  age
##   21   1  age
##   21   2  age
##   21   3  age
##   21   4  age
##   21   5  age
##   22   1  age
##   22   2  age
##   22   3  age
##   22   4  age
##   22   5  age
##   23   1  age
##   23   2  age
##   23   3  age
##   23   4  age
##   23   5  age
##   24   1  age
##   24   2  age
##   24   3  age
##   24   4  age
##   24   5  age
##   25   1  age
##   25   2  age
##   25   3  age
##   25   4  age
##   25   5  age
##   26   1  age
##   26   2  age
##   26   3  age
##   26   4  age
##   26   5  age
##   27   1  age
##   27   2  age
##   27   3  age
##   27   4  age
##   27   5  age
##   28   1  age
##   28   2  age
##   28   3  age
##   28   4  age
##   28   5  age
##   29   1  age
##   29   2  age
##   29   3  age
##   29   4  age
##   29   5  age
##   30   1  age
##   30   2  age
##   30   3  age
##   30   4  age
##   30   5  age
##   31   1  age
##   31   2  age
##   31   3  age
##   31   4  age
##   31   5  age
##   32   1  age
##   32   2  age
##   32   3  age
##   32   4  age
##   32   5  age
##   33   1  age
##   33   2  age
##   33   3  age
##   33   4  age
##   33   5  age
##   34   1  age
##   34   2  age
##   34   3  age
##   34   4  age
##   34   5  age
##   35   1  age
##   35   2  age
##   35   3  age
##   35   4  age
##   35   5  age
##   36   1  age
##   36   2  age
##   36   3  age
##   36   4  age
##   36   5  age
##   37   1  age
##   37   2  age
##   37   3  age
##   37   4  age
##   37   5  age
##   38   1  age
##   38   2  age
##   38   3  age
##   38   4  age
##   38   5  age
##   39   1  age
##   39   2  age
##   39   3  age
##   39   4  age
##   39   5  age
##   40   1  age
##   40   2  age
##   40   3  age
##   40   4  age
##   40   5  age
##   41   1  age
##   41   2  age
##   41   3  age
##   41   4  age
##   41   5  age
##   42   1  age
##   42   2  age
##   42   3  age
##   42   4  age
##   42   5  age
##   43   1  age
##   43   2  age
##   43   3  age
##   43   4  age
##   43   5  age
##   44   1  age
##   44   2  age
##   44   3  age
##   44   4  age
##   44   5  age
##   45   1  age
##   45   2  age
##   45   3  age
##   45   4  age
##   45   5  age
##   46   1  age
##   46   2  age
##   46   3  age
##   46   4  age
##   46   5  age
##   47   1  age
##   47   2  age
##   47   3  age
##   47   4  age
##   47   5  age
##   48   1  age
##   48   2  age
##   48   3  age
##   48   4  age
##   48   5  age
##   49   1  age
##   49   2  age
##   49   3  age
##   49   4  age
##   49   5  age
##   50   1  age
##   50   2  age
##   50   3  age
##   50   4  age
##   50   5  age

## Warning: Number of logged events: 1

mean(titanic$age, na.rm = TRUE)    ##the original data's mean value

## [1] 29.88113

lapply(imputed_data.test$imp$age,mean) ##each imputed set's mean

## $`1`
## [1] 29.57547
## 
## $`2`
## [1] 24.76415
## 
## $`3`
## [1] 26.62736
## 
## $`4`
## [1] 24.76101
## 
## $`5`
## [1] 25.96226

sd(titanic$age, na.rm = TRUE)      ##the original data's standard deviation value

## [1] 14.4135

lapply(imputed_data.test$imp$age,sd)  ####each imputed set's standard deviation

## $`1`
## [1] 12.62606
## 
## $`2`
## [1] 7.811852
## 
## $`3`
## [1] 11.43624
## 
## $`4`
## [1] 10.28185
## 
## $`5`
## [1] 11.60398

titanic.test.complete<-complete(imputed_data.test, 1)

4.3 Change the sibsp and parch variables to factor

titanic.train.complete$sibsp<-as.factor(titanic.train.complete$sibsp)
titanic.train.complete$parch<-as.factor(titanic.train.complete$parch)
titanic.test.complete$sibsp<-as.factor(titanic.test.complete$sibsp)
titanic.test.complete$parch<-as.factor(titanic.test.complete$parch)

5. Prediction model: Fitting and testing the model

plot(summary(survived~sex+pclass+age+parch+sibsp,data=titanic3))

dd <- datadist (titanic.train.complete)
options(datadist = 'dd')

5.1 First model

fit1<-lrm(survived ~ age+sex+pclass+sibsp+parch+fare,data = titanic.train.complete)
fit1

## Logistic Regression Model
##  
##  lrm(formula = survived ~ age + sex + pclass + sibsp + parch + 
##      fare, data = titanic.train.complete)
##  
##                        Model Likelihood       Discrimination    Rank Discrim.    
##                              Ratio Test              Indexes          Indexes    
##  Obs          263    LR chi2     128.68       R2       0.520    C       0.868    
##   0           152    d.f.            16      R2(16,263)0.348    Dxy     0.735    
##   1           111    Pr(> chi2) <0.0001    R2(16,192.5)0.443    gamma   0.736    
##  max |deriv| 0.06                             Brier    0.141    tau-a   0.360    
##  
##             Coef     S.E.    Wald Z Pr(>|Z|)
##  Intercept    5.0284  0.9145  5.50  <0.0001 
##  age         -0.0586  0.0155 -3.77  0.0002  
##  sex=male    -2.2225  0.3581 -6.21  <0.0001 
##  pclass=2nd  -1.9852  0.5756 -3.45  0.0006  
##  pclass=3rd  -3.2222  0.5996 -5.37  <0.0001 
##  sibsp=1     -0.7113  0.4382 -1.62  0.1045  
##  sibsp=2     -0.9224  1.1272 -0.82  0.4132  
##  sibsp=3    -10.2403 40.0022 -0.26  0.7980  
##  sibsp=4     -2.7062  1.3723 -1.97  0.0486  
##  sibsp=5     -8.5961 61.5411 -0.14  0.8889  
##  sibsp=8     -7.5268 61.5406 -0.12  0.9027  
##  parch=1      1.1520  0.5653  2.04  0.0416  
##  parch=2      0.9823  0.8407  1.17  0.2427  
##  parch=3      9.1553 61.7992  0.15  0.8822  
##  parch=4     -6.7969 61.5367 -0.11  0.9120  
##  parch=5     -4.6652 61.5373 -0.08  0.9396  
##  fare        -0.0032  0.0045 -0.71  0.4747  
##

5.2 Second model

fit2<-lrm(survived ~ pclass+sex*rcs(age,3)+sibsp+parch+fare,data=titanic.train.complete)
fit2

## Logistic Regression Model
##  
##  lrm(formula = survived ~ pclass + sex * rcs(age, 3) + sibsp + 
##      parch + fare, data = titanic.train.complete)
##  
##                        Model Likelihood       Discrimination    Rank Discrim.    
##                              Ratio Test              Indexes          Indexes    
##  Obs          263    LR chi2     142.51       R2       0.562    C       0.882    
##   0           152    d.f.            19      R2(19,263)0.375    Dxy     0.764    
##   1           111    Pr(> chi2) <0.0001    R2(19,192.5)0.474    gamma   0.764    
##  max |deriv| 0.05                             Brier    0.134    tau-a   0.374    
##  
##                  Coef     S.E.    Wald Z Pr(>|Z|)
##  Intercept         5.2679  1.3062  4.03  <0.0001 
##  pclass=2nd       -2.8396  0.7405 -3.83  0.0001  
##  pclass=3rd       -3.9390  0.7418 -5.31  <0.0001 
##  sex=male          0.1059  1.4231  0.07  0.9407  
##  age              -0.0538  0.0468 -1.15  0.2507  
##  age'              0.0871  0.0883  0.99  0.3239  
##  sibsp=1          -0.7915  0.4612 -1.72  0.0862  
##  sibsp=2          -0.8756  1.0085 -0.87  0.3852  
##  sibsp=3         -10.4904 43.2428 -0.24  0.8083  
##  sibsp=4          -3.1534  1.3363 -2.36  0.0183  
##  sibsp=5         -10.4083 61.5493 -0.17  0.8657  
##  sibsp=8          -8.0061 61.5414 -0.13  0.8965  
##  parch=1           1.2378  0.6452  1.92  0.0550  
##  parch=2           1.2089  0.8847  1.37  0.1718  
##  parch=3           7.8746 61.8017  0.13  0.8986  
##  parch=4          -8.2853 61.5394 -0.13  0.8929  
##  parch=5          -3.6351 61.5388 -0.06  0.9529  
##  fare             -0.0073  0.0056 -1.31  0.1889  
##  sex=male * age   -0.0778  0.0654 -1.19  0.2336  
##  sex=male * age'  -0.0582  0.1258 -0.46  0.6435  
##

5.3 Third model

# Trying to increase the number of knots on restricted cubic age, eliminating the parch variable due to its insignificance, and defining interaction terms between pclass and sex.
fit3<-lrm(survived ~ sex*rcs(age,5) + sibsp + pclass*sex+fare,data=titanic.train.complete)
fit3

## Logistic Regression Model
##  
##  lrm(formula = survived ~ sex * rcs(age, 5) + sibsp + pclass * 
##      sex + fare, data = titanic.train.complete)
##  
##                       Model Likelihood       Discrimination    Rank Discrim.    
##                             Ratio Test              Indexes          Indexes    
##  Obs         263    LR chi2     151.12       R2       0.588    C       0.884    
##   0          152    d.f.            20      R2(20,263)0.393    Dxy     0.768    
##   1          111    Pr(> chi2) <0.0001    R2(20,192.5)0.494    gamma   0.768    
##  max |deriv| 0.5                             Brier    0.127    tau-a   0.376    
##  
##                        Coef     S.E.     Wald Z Pr(>|Z|)
##  Intercept              23.3049  31.9977  0.73  0.4664  
##  sex=male              -16.3895  31.8109 -0.52  0.6064  
##  age                    -0.0249   0.1022 -0.24  0.8076  
##  age'                   -0.8671   0.7608 -1.14  0.2544  
##  age''                  10.7499   7.4198  1.45  0.1474  
##  age'''                -17.4453  11.6177 -1.50  0.1332  
##  sibsp=1                -0.2438   0.5187 -0.47  0.6383  
##  sibsp=2                -0.3255   1.0317 -0.32  0.7524  
##  sibsp=3               -13.1538 192.4452 -0.07  0.9455  
##  sibsp=4                -2.3974   1.2926 -1.85  0.0636  
##  sibsp=5               -13.1330 275.7687 -0.05  0.9620  
##  sibsp=8                -8.9991 275.7655 -0.03  0.9740  
##  pclass=2nd            -18.7704  31.9265 -0.59  0.5566  
##  pclass=3rd            -21.3330  31.9562 -0.67  0.5044  
##  fare                   -0.0258   0.0156 -1.66  0.0969  
##  sex=male * age         -0.1951   0.1611 -1.21  0.2258  
##  sex=male * age'         1.1215   0.9947  1.13  0.2595  
##  sex=male * age''      -10.3224   9.2079 -1.12  0.2623  
##  sex=male * age'''      15.2048  14.3160  1.06  0.2882  
##  sex=male * pclass=2nd  15.6556  31.8079  0.49  0.6226  
##  sex=male * pclass=3rd  17.7430  31.8115  0.56  0.5770  
##

5.4 Fourth model

fit4<-lrm(survived~sex*rcs(age,5)+ sibsp + pclass*sex,data=titanic.train.complete)
fit4

## Logistic Regression Model
##  
##  lrm(formula = survived ~ sex * rcs(age, 5) + sibsp + pclass * 
##      sex, data = titanic.train.complete)
##  
##                        Model Likelihood       Discrimination    Rank Discrim.    
##                              Ratio Test              Indexes          Indexes    
##  Obs          263    LR chi2     147.66       R2       0.578    C       0.879    
##   0           152    d.f.            19      R2(19,263)0.387    Dxy     0.758    
##   1           111    Pr(> chi2) <0.0001    R2(19,192.5)0.488    gamma   0.762    
##  max |deriv| 0.06                             Brier    0.131    tau-a   0.371    
##  
##                        Coef     S.E.     Wald Z Pr(>|Z|)
##  Intercept              13.1778  27.4628  0.48  0.6313  
##  sex=male               -7.6017  27.5221 -0.28  0.7824  
##  age                    -0.0322   0.1022 -0.32  0.7525  
##  age'                   -0.8413   0.7611 -1.11  0.2690  
##  age''                  10.5137   7.4349  1.41  0.1573  
##  age'''                -17.0725  11.6639 -1.46  0.1433  
##  sibsp=1                -0.6675   0.4753 -1.40  0.1603  
##  sibsp=2                -0.7980   0.9844 -0.81  0.4175  
##  sibsp=3               -12.5523 116.9498 -0.11  0.9145  
##  sibsp=4                -2.9065   1.3039 -2.23  0.0258  
##  sibsp=5               -13.1022 167.2688 -0.08  0.9376  
##  sibsp=8                -9.6145 167.2603 -0.06  0.9542  
##  pclass=2nd             -8.8170  27.4348 -0.32  0.7479  
##  pclass=3rd            -11.2348  27.4352 -0.41  0.6822  
##  sex=male * age         -0.1826   0.1644 -1.11  0.2667  
##  sex=male * age'         1.0593   1.0005  1.06  0.2897  
##  sex=male * age''       -9.5153   9.1807 -1.04  0.3000  
##  sex=male * age'''      13.7797  14.2408  0.97  0.3332  
##  sex=male * pclass=2nd   6.6230  27.4456  0.24  0.8093  
##  sex=male * pclass=3rd   8.7366  27.4436  0.32  0.7502  
##

6. ConfusionMatrix

anova(fit4)

##                 Wald Statistics          Response: survived 
## 
##  Factor                                      Chi-Square d.f. P     
##  sex  (Factor+Higher Order Factors)          26.50       7   0.0004
##   All Interactions                           10.76       6   0.0961
##  age  (Factor+Higher Order Factors)          23.96       8   0.0023
##   All Interactions                            3.14       4   0.5346
##   Nonlinear (Factor+Higher Order Factors)     6.02       6   0.4206
##  sibsp                                        6.19       6   0.4022
##  pclass  (Factor+Higher Order Factors)       22.11       4   0.0002
##   All Interactions                            5.04       2   0.0803
##  sex * age  (Factor+Higher Order Factors)     3.14       4   0.5346
##   Nonlinear                                   1.22       3   0.7485
##   Nonlinear Interaction : f(A,B) vs. AB       1.22       3   0.7485
##  sex * pclass  (Factor+Higher Order Factors)  5.04       2   0.0803
##  TOTAL NONLINEAR                              6.02       6   0.4206
##  TOTAL INTERACTION                           10.76       6   0.0961
##  TOTAL NONLINEAR + INTERACTION               14.45       9   0.1072
##  TOTAL                                       47.34      19   0.0003

ggplot(Predict(fit4, age , sex , pclass, sibsp=0 , fun=plogis))

test<-predict(fit4, titanic.test.complete)             ## apply the model to the test data
test<-ifelse(test > 0.5, 1,0)                            ## convert the probability value to survive or not.
library(caret)

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

## The following object is masked from 'package:purrr':
## 
##     lift

confusionMatrix(table(test,actual), positive = "1")   ## compare the model's output to the actual data

## Confusion Matrix and Statistics
## 
##     actual
## test   0   1
##    0 142  38
##    1  10  73
##                                           
##                Accuracy : 0.8175          
##                  95% CI : (0.7654, 0.8623)
##     No Information Rate : 0.5779          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6127          
##                                           
##  Mcnemar's Test P-Value : 9.735e-05       
##                                           
##             Sensitivity : 0.6577          
##             Specificity : 0.9342          
##          Pos Pred Value : 0.8795          
##          Neg Pred Value : 0.7889          
##              Prevalence : 0.4221          
##          Detection Rate : 0.2776          
##    Detection Prevalence : 0.3156          
##       Balanced Accuracy : 0.7959          
##                                           
##        'Positive' Class : 1               
##

https://rpubs.com/Mahmoud2620/1015400

ML - Logistic Regression

Vu Thien

2023-03-15