library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(rms)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Getting the data from Hmisc and rms libraries getHdata(titanic3)
## assign it to a variable called “titanic” titanic<-titanic3
getHdata(titanic3)
titanic <- titanic3
# Exploration the data
glimpse(titanic)
## Rows: 1,309
## Columns: 14
## $ pclass <fct> 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, 1st, …
## $ survived <labelled> 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1…
## $ name <labelled> "Allen, Miss. Elisabeth Walton", "Allison, Master. Huds…
## $ sex <fct> female, male, female, male, female, male, female, male, fema…
## $ age <labelled> 29.0000, 0.9167, 2.0000, 30.0000, 25.0000, 48.0000, 63.…
## $ sibsp <labelled> 0, 1, 1, 1, 1, 0, 1, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0…
## $ parch <labelled> 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0…
## $ ticket <labelled> "24160", "113781", "113781", "113781", "113781", "19952…
## $ fare <labelled> 211.3375, 151.5500, 151.5500, 151.5500, 151.5500, 26.55…
## $ cabin <fct> B5, C22 C26, C22 C26, C22 C26, C22 C26, E12, D7, A36, C101, …
## $ embarked <fct> Southampton, Southampton, Southampton, Southampton, Southamp…
## $ boat <fct> 2, 11, , , , 3, 10, , D, , , 4, 9, 6, B, , , 6, 8, A, 5, 5, …
## $ body <labelled> NA, NA, NA, 135, NA, NA, NA, NA, NA, 22, 124, NA, NA, N…
## $ home.dest <labelled> "St Louis, MO", "Montreal, PQ / Chesterville, ON", "Mon…
dim(titanic)
## [1] 1309 14
head(titanic, 10)
## pclass survived name sex age sibsp parch
## 1 1st 1 Allen, Miss. Elisabeth Walton female 29.0000 0 0
## 2 1st 1 Allison, Master. Hudson Trevor male 0.9167 1 2
## 3 1st 0 Allison, Miss. Helen Loraine female 2.0000 1 2
## 4 1st 0 Allison, Mr. Hudson Joshua Crei male 30.0000 1 2
## 5 1st 0 Allison, Mrs. Hudson J C (Bessi female 25.0000 1 2
## 6 1st 1 Anderson, Mr. Harry male 48.0000 0 0
## 7 1st 1 Andrews, Miss. Kornelia Theodos female 63.0000 1 0
## 8 1st 0 Andrews, Mr. Thomas Jr male 39.0000 0 0
## 9 1st 1 Appleton, Mrs. Edward Dale (Cha female 53.0000 2 0
## 10 1st 0 Artagaveytia, Mr. Ramon male 71.0000 0 0
## ticket fare cabin embarked boat body
## 1 24160 211.3375 B5 Southampton 2 NA
## 2 113781 151.5500 C22 C26 Southampton 11 NA
## 3 113781 151.5500 C22 C26 Southampton NA
## 4 113781 151.5500 C22 C26 Southampton 135
## 5 113781 151.5500 C22 C26 Southampton NA
## 6 19952 26.5500 E12 Southampton 3 NA
## 7 13502 77.9583 D7 Southampton 10 NA
## 8 112050 0.0000 A36 Southampton NA
## 9 11769 51.4792 C101 Southampton D NA
## 10 PC 17609 49.5042 Cherbourg 22
## home.dest
## 1 St Louis, MO
## 2 Montreal, PQ / Chesterville, ON
## 3 Montreal, PQ / Chesterville, ON
## 4 Montreal, PQ / Chesterville, ON
## 5 Montreal, PQ / Chesterville, ON
## 6 New York, NY
## 7 Hudson, NY
## 8 Belfast, NI
## 9 Bayside, Queens, NY
## 10 Montevideo, Uruguay
tail(titanic, 10)
## pclass survived name sex age sibsp parch
## 1300 3rd 0 Yasbeck, Mr. Antoni male 27.0 1 0
## 1301 3rd 1 Yasbeck, Mrs. Antoni (Selini Al female 15.0 1 0
## 1302 3rd 0 Youseff, Mr. Gerious male 45.5 0 0
## 1303 3rd 0 Yousif, Mr. Wazli male NA 0 0
## 1304 3rd 0 Yousseff, Mr. Gerious male NA 0 0
## 1305 3rd 0 Zabour, Miss. Hileni female 14.5 1 0
## 1306 3rd 0 Zabour, Miss. Thamine female NA 1 0
## 1307 3rd 0 Zakarian, Mr. Mapriededer male 26.5 0 0
## 1308 3rd 0 Zakarian, Mr. Ortin male 27.0 0 0
## 1309 3rd 0 Zimmerman, Mr. Leo male 29.0 0 0
## ticket fare cabin embarked boat body home.dest
## 1300 2659 14.4542 Cherbourg C NA
## 1301 2659 14.4542 Cherbourg NA
## 1302 2628 7.2250 Cherbourg 312
## 1303 2647 7.2250 Cherbourg NA
## 1304 2627 14.4583 Cherbourg NA
## 1305 2665 14.4542 Cherbourg 328
## 1306 2665 14.4542 Cherbourg NA
## 1307 2656 7.2250 Cherbourg 304
## 1308 2670 7.2250 Cherbourg NA
## 1309 315082 7.8750 Southampton NA
str(titanic)
## 'data.frame': 1309 obs. of 14 variables:
## $ pclass : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
## $ survived : 'labelled' int 1 1 0 0 0 1 1 0 1 0 ...
## ..- attr(*, "label")= chr "Survived"
## $ name : 'labelled' chr "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Crei" ...
## ..- attr(*, "label")= chr "Name"
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ age : 'labelled' num 29 0.917 2 30 25 ...
## ..- attr(*, "units")= chr "Year"
## ..- attr(*, "label")= chr "Age"
## $ sibsp : 'labelled' int 0 1 1 1 1 0 1 0 2 0 ...
## ..- attr(*, "label")= chr "Number of Siblings/Spouses Aboard"
## $ parch : 'labelled' int 0 2 2 2 2 0 0 0 0 0 ...
## ..- attr(*, "label")= chr "Number of Parents/Children Aboard"
## $ ticket : 'labelled' chr "24160" "113781" "113781" "113781" ...
## ..- attr(*, "label")= chr "Ticket Number"
## $ fare : 'labelled' num 211 152 152 152 152 ...
## ..- attr(*, "units")= chr "British Pound (\\243)"
## ..- attr(*, "label")= chr "Passenger Fare"
## $ cabin : Factor w/ 187 levels "","A10","A11",..: 45 81 81 81 81 151 147 17 63 1 ...
## $ embarked : Factor w/ 3 levels "Cherbourg","Queenstown",..: 3 3 3 3 3 3 3 3 3 1 ...
## $ boat : Factor w/ 28 levels "","1","10","11",..: 13 4 1 1 1 14 3 1 28 1 ...
## $ body : 'labelled' int NA NA NA 135 NA NA NA NA NA 22 ...
## ..- attr(*, "label")= chr "Body Identification Number"
## $ home.dest: 'labelled' chr "St Louis, MO" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" ...
## ..- attr(*, "label")= chr "Home/Destination"
titanic2 <- titanic %>% select(age, sex, ticket, fare, pclass, sibsp, parch, survived)
summary(titanic2)
## age sex ticket fare pclass
## Min. : 0.1667 female:466 Length:1309 Min. : 0.000 1st:323
## 1st Qu.:21.0000 male :843 Class :labelled 1st Qu.: 7.896 2nd:277
## Median :28.0000 Mode :character Median : 14.454 3rd:709
## Mean :29.8811 Mean : 33.295
## 3rd Qu.:39.0000 3rd Qu.: 31.275
## Max. :80.0000 Max. :512.329
## NA's :263 NA's :1
## sibsp parch survived
## Min. :0.0000 Min. :0.000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.0000 Median :0.000 Median :0.000
## Mean :0.4989 Mean :0.385 Mean :0.382
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.:1.000
## Max. :8.0000 Max. :9.000 Max. :1.000
##
describe(titanic2)
## titanic2
##
## 8 Variables 1309 Observations
## --------------------------------------------------------------------------------
## age : Age [Year]
## n missing distinct Info Mean Gmd .05 .10
## 1046 263 98 0.999 29.88 16.06 5 14
## .25 .50 .75 .90 .95
## 21 28 39 50 57
##
## lowest : 0.1667 0.3333 0.4167 0.6667 0.7500
## highest: 70.5000 71.0000 74.0000 76.0000 80.0000
## --------------------------------------------------------------------------------
## sex
## n missing distinct
## 1309 0 2
##
## Value female male
## Frequency 466 843
## Proportion 0.356 0.644
## --------------------------------------------------------------------------------
## ticket : Ticket Number
## n missing distinct
## 1309 0 929
##
## lowest : 110152 110413 110465 110469 110489
## highest: W./C. 6608 W./C. 6609 W.E.P. 5734 W/C 14208 WE/P 5735
## --------------------------------------------------------------------------------
## fare : Passenger Fare [British Pound (\243)]
## n missing distinct Info Mean Gmd .05 .10
## 1308 1 281 1 33.3 38.61 7.225 7.567
## .25 .50 .75 .90 .95
## 7.896 14.454 31.275 78.051 133.650
##
## lowest : 0.0000 3.1708 4.0125 5.0000 6.2375
## highest: 227.5250 247.5208 262.3750 263.0000 512.3292
## --------------------------------------------------------------------------------
## pclass
## n missing distinct
## 1309 0 3
##
## Value 1st 2nd 3rd
## Frequency 323 277 709
## Proportion 0.247 0.212 0.542
## --------------------------------------------------------------------------------
## sibsp : Number of Siblings/Spouses Aboard
## n missing distinct Info Mean Gmd
## 1309 0 7 0.67 0.4989 0.777
##
## lowest : 0 1 2 3 4, highest: 2 3 4 5 8
##
## Value 0 1 2 3 4 5 8
## Frequency 891 319 42 20 22 6 9
## Proportion 0.681 0.244 0.032 0.015 0.017 0.005 0.007
## --------------------------------------------------------------------------------
## parch : Number of Parents/Children Aboard
## n missing distinct Info Mean Gmd
## 1309 0 8 0.549 0.385 0.6375
##
## lowest : 0 1 2 3 4, highest: 3 4 5 6 9
##
## Value 0 1 2 3 4 5 6 9
## Frequency 1002 170 113 8 6 6 2 2
## Proportion 0.765 0.130 0.086 0.006 0.005 0.005 0.002 0.002
## --------------------------------------------------------------------------------
## survived : Survived
## n missing distinct Info Sum Mean Gmd
## 1309 0 2 0.708 500 0.382 0.4725
##
## --------------------------------------------------------------------------------
ggplot(titanic, aes(as.factor(survived), fill = as.factor(survived))) +
geom_bar(width = 0.6) +
theme(legend.position = "none") +
scale_x_discrete(labels = c ("No", "Yes")) +
xlab("survived") +
ggtitle("The survival distributions")
ggplot(data = titanic, mapping = aes(x = sex, fill = as.factor(survived))) +
geom_bar(width = 0.6) +
guides(fill = guide_legend(title = "survived")) +
ggtitle("The survival marginal distribution according to gender")
ggplot(data = titanic, mapping = aes(x = pclass, fill = as.factor(survived))) +
geom_bar(width = 0.6) +
guides(fill = guide_legend(title = "survived")) +
ggtitle("The survival marginal distritution according to class")
ggplot(titanic,aes(x=age,y=survived)) +
geom_smooth(method="loess") +
ylim (0,1) # it is obvious that there is not a linear relationship
## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 263 rows containing non-finite values (`stat_smooth()`).
ggplot(titanic,aes(x=age,y=survived,col=sex)) +
geom_smooth(method="loess") +
ylim (0,1)
## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 263 rows containing non-finite values (`stat_smooth()`).
The plot is showing the relationship between age and survival, grouped by sex. The trendline shows the general relationship between age and survival, with higher survival rates among younger passengers and lower survival rates among older passengers. The trendline is different for males and females, which suggests that the relationship between age and survival varies by sex.
When examining each sex separately, the effect of age is more noticeable. Specifically, older females are more likely to survive than younger females, whereas the opposite is true for males.
ggplot(titanic, aes(x = age, y = survived, col = pclass)) + geom_smooth(method = "loess") + ylim (0,1)
## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 263 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 5 rows containing missing values (`geom_smooth()`).
ggplot(titanic, aes(x = age, y = survived, col = pclass)) + facet_grid(~sex) + geom_smooth(method = "loess") + ylim (0,1)
## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 263 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 23 rows containing missing values (`geom_smooth()`).
# --> face_grid: this function used to create separate plots for each combination of sex and pclass
ggplot(titanic, aes(x = fare, y = survived)) + geom_smooth(method = "loess") + ylim(0,1)
## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
# --> positive impact on the survival
ggplot(data = titanic, mapping = aes(fare, fill = fare)) + geom_histogram()
## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
ggplot(titanic,aes(fare)) + geom_histogram() + facet_grid(~pclass)
## Don't know how to automatically pick scale for object of type <labelled>.
## Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
colSums(is.na(titanic))
## pclass survived name sex age sibsp parch ticket
## 0 0 0 0 263 0 0 0
## fare cabin embarked boat body home.dest
## 1 0 2 0 1188 0
md.pattern(titanic)
## pclass survived name sex sibsp parch ticket cabin boat home.dest fare
## 119 1 1 1 1 1 1 1 1 1 1 1
## 924 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 262 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 0
## 0 0 0 0 0 0 0 0 0 0 1
## embarked age body
## 119 1 1 1 0
## 924 1 1 0 1
## 1 1 0 1 1
## 262 1 0 0 2
## 2 0 1 0 2
## 1 1 1 1 1
## 2 263 1188 1454
md.pattern(titanic2)
## sex ticket pclass sibsp parch survived fare age
## 1045 1 1 1 1 1 1 1 1 0
## 263 1 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 0 1 1
## 0 0 0 0 0 0 1 263 264
titanic[which(is.na(titanic$fare)),]
## pclass survived name sex age sibsp parch ticket fare cabin
## 1226 3rd 0 Storey, Mr. Thomas male 60.5 0 0 3701 NA
## embarked boat body home.dest
## 1226 Southampton 261
titanic2[which(is.na(titanic2$fare)),]
## age sex ticket fare pclass sibsp parch survived
## 1226 60.5 male 3701 NA 3rd 0 0 0
summary(aov(fare~pclass,data=titanic)) # --> It is statistically significant that the means of the fares differ amongst the classes.
## Df Sum Sq Mean Sq F value Pr(>F)
## pclass 2 1272986 636493 372.7 <2e-16 ***
## Residuals 1305 2228415 1708
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
TukeyHSD(aov(fare~pclass,data=titanic)) # --> Tukey test to see where the difference is
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = fare ~ pclass, data = titanic)
##
## $pclass
## diff lwr upr p adj
## 2nd-1st -66.329795 -74.26990 -58.389693 0.0000000
## 3rd-1st -74.206103 -80.71643 -67.695772 0.0000000
## 3rd-2nd -7.876308 -14.74783 -1.004781 0.0198199
# The difference between each class is statistically significant.
summary(is.na(age) ~ survived, data = titanic)
## is.na(age) N= 1309
##
## +--------+---+----+----------+
## | | | N|is.na(age)|
## +--------+---+----+----------+
## |Survived| No| 809| 0.2348578|
## | |Yes| 500| 0.1460000|
## +--------+---+----+----------+
## | Overall| |1309| 0.2009167|
## +--------+---+----+----------+
age.na<-glm(is.na(age)~sex+pclass+survived+parch+sibsp,data=titanic)
summary(age.na)
##
## Call:
## glm(formula = is.na(age) ~ sex + pclass + survived + parch +
## sibsp, data = titanic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3222 -0.2845 -0.1228 -0.0410 1.0389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.157060 0.036134 4.347 1.49e-05 ***
## sexmale -0.002480 0.026931 -0.092 0.92665
## pclass2nd -0.069131 0.031976 -2.162 0.03080 *
## pclass3rd 0.161677 0.027340 5.914 4.27e-09 ***
## survived -0.034264 0.027185 -1.260 0.20775
## parch -0.039656 0.013551 -2.927 0.00349 **
## sibsp 0.001743 0.011134 0.157 0.87561
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1493451)
##
## Null deviance: 210.16 on 1308 degrees of freedom
## Residual deviance: 194.45 on 1302 degrees of freedom
## AIC: 1234.7
##
## Number of Fisher Scoring iterations: 2
## Make it reproducible
set.seed(13032019)
#use 80% of dataset as training set and 20% as test set
sample <- sample(c(TRUE, FALSE), nrow(titanic2), replace=TRUE, prob=c(0.8,0.2))
train.titanic <- titanic2[sample, ]
test.titanic <- titanic2[!sample, ]
md.pattern(test.titanic)
## sex ticket fare pclass sibsp parch survived age
## 210 1 1 1 1 1 1 1 1 0
## 53 1 1 1 1 1 1 1 0 1
## 0 0 0 0 0 0 0 53 53
md.pattern(train.titanic)
## sex ticket pclass sibsp parch survived fare age
## 835 1 1 1 1 1 1 1 1 0
## 210 1 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 0 1 1
## 0 0 0 0 0 0 1 210 211
set.seed(13032019)
imputed_data<-mice(test.titanic,m=5,maxit = 50,method = 'rf')
##
## iter imp variable
## 1 1 age
## 1 2 age
## 1 3 age
## 1 4 age
## 1 5 age
## 2 1 age
## 2 2 age
## 2 3 age
## 2 4 age
## 2 5 age
## 3 1 age
## 3 2 age
## 3 3 age
## 3 4 age
## 3 5 age
## 4 1 age
## 4 2 age
## 4 3 age
## 4 4 age
## 4 5 age
## 5 1 age
## 5 2 age
## 5 3 age
## 5 4 age
## 5 5 age
## 6 1 age
## 6 2 age
## 6 3 age
## 6 4 age
## 6 5 age
## 7 1 age
## 7 2 age
## 7 3 age
## 7 4 age
## 7 5 age
## 8 1 age
## 8 2 age
## 8 3 age
## 8 4 age
## 8 5 age
## 9 1 age
## 9 2 age
## 9 3 age
## 9 4 age
## 9 5 age
## 10 1 age
## 10 2 age
## 10 3 age
## 10 4 age
## 10 5 age
## 11 1 age
## 11 2 age
## 11 3 age
## 11 4 age
## 11 5 age
## 12 1 age
## 12 2 age
## 12 3 age
## 12 4 age
## 12 5 age
## 13 1 age
## 13 2 age
## 13 3 age
## 13 4 age
## 13 5 age
## 14 1 age
## 14 2 age
## 14 3 age
## 14 4 age
## 14 5 age
## 15 1 age
## 15 2 age
## 15 3 age
## 15 4 age
## 15 5 age
## 16 1 age
## 16 2 age
## 16 3 age
## 16 4 age
## 16 5 age
## 17 1 age
## 17 2 age
## 17 3 age
## 17 4 age
## 17 5 age
## 18 1 age
## 18 2 age
## 18 3 age
## 18 4 age
## 18 5 age
## 19 1 age
## 19 2 age
## 19 3 age
## 19 4 age
## 19 5 age
## 20 1 age
## 20 2 age
## 20 3 age
## 20 4 age
## 20 5 age
## 21 1 age
## 21 2 age
## 21 3 age
## 21 4 age
## 21 5 age
## 22 1 age
## 22 2 age
## 22 3 age
## 22 4 age
## 22 5 age
## 23 1 age
## 23 2 age
## 23 3 age
## 23 4 age
## 23 5 age
## 24 1 age
## 24 2 age
## 24 3 age
## 24 4 age
## 24 5 age
## 25 1 age
## 25 2 age
## 25 3 age
## 25 4 age
## 25 5 age
## 26 1 age
## 26 2 age
## 26 3 age
## 26 4 age
## 26 5 age
## 27 1 age
## 27 2 age
## 27 3 age
## 27 4 age
## 27 5 age
## 28 1 age
## 28 2 age
## 28 3 age
## 28 4 age
## 28 5 age
## 29 1 age
## 29 2 age
## 29 3 age
## 29 4 age
## 29 5 age
## 30 1 age
## 30 2 age
## 30 3 age
## 30 4 age
## 30 5 age
## 31 1 age
## 31 2 age
## 31 3 age
## 31 4 age
## 31 5 age
## 32 1 age
## 32 2 age
## 32 3 age
## 32 4 age
## 32 5 age
## 33 1 age
## 33 2 age
## 33 3 age
## 33 4 age
## 33 5 age
## 34 1 age
## 34 2 age
## 34 3 age
## 34 4 age
## 34 5 age
## 35 1 age
## 35 2 age
## 35 3 age
## 35 4 age
## 35 5 age
## 36 1 age
## 36 2 age
## 36 3 age
## 36 4 age
## 36 5 age
## 37 1 age
## 37 2 age
## 37 3 age
## 37 4 age
## 37 5 age
## 38 1 age
## 38 2 age
## 38 3 age
## 38 4 age
## 38 5 age
## 39 1 age
## 39 2 age
## 39 3 age
## 39 4 age
## 39 5 age
## 40 1 age
## 40 2 age
## 40 3 age
## 40 4 age
## 40 5 age
## 41 1 age
## 41 2 age
## 41 3 age
## 41 4 age
## 41 5 age
## 42 1 age
## 42 2 age
## 42 3 age
## 42 4 age
## 42 5 age
## 43 1 age
## 43 2 age
## 43 3 age
## 43 4 age
## 43 5 age
## 44 1 age
## 44 2 age
## 44 3 age
## 44 4 age
## 44 5 age
## 45 1 age
## 45 2 age
## 45 3 age
## 45 4 age
## 45 5 age
## 46 1 age
## 46 2 age
## 46 3 age
## 46 4 age
## 46 5 age
## 47 1 age
## 47 2 age
## 47 3 age
## 47 4 age
## 47 5 age
## 48 1 age
## 48 2 age
## 48 3 age
## 48 4 age
## 48 5 age
## 49 1 age
## 49 2 age
## 49 3 age
## 49 4 age
## 49 5 age
## 50 1 age
## 50 2 age
## 50 3 age
## 50 4 age
## 50 5 age
## Warning: Number of logged events: 1
mean(titanic$age,na.rm = TRUE) ##the original data's mean value
## [1] 29.88113
lapply(imputed_data$imp$age,mean) ##each imputed set's mean
## $`1`
## [1] 24.62264
##
## $`2`
## [1] 29
##
## $`3`
## [1] 30.32075
##
## $`4`
## [1] 25.5
##
## $`5`
## [1] 25.5566
sd(titanic$age,na.rm = TRUE) ##the original data's standard deviation value
## [1] 14.4135
lapply(imputed_data$imp$age,sd) ##each imputed set's standard deviation
## $`1`
## [1] 8.792428
##
## $`2`
## [1] 12.72868
##
## $`3`
## [1] 13.86962
##
## $`4`
## [1] 10.80064
##
## $`5`
## [1] 9.718965
# So we will choose the 2nd set
titanic.train.complete<-complete(imputed_data, 3)
##assign the survived variable from the test.titanic to to "actual"
actual <- test.titanic$survived
##drop it from the dataset
test.titanic <- select(test.titanic,-survived)
set.seed(13032019)
imputed_data.test<-mice(test.titanic,m=5,maxit = 50,method = 'rf')
##
## iter imp variable
## 1 1 age
## 1 2 age
## 1 3 age
## 1 4 age
## 1 5 age
## 2 1 age
## 2 2 age
## 2 3 age
## 2 4 age
## 2 5 age
## 3 1 age
## 3 2 age
## 3 3 age
## 3 4 age
## 3 5 age
## 4 1 age
## 4 2 age
## 4 3 age
## 4 4 age
## 4 5 age
## 5 1 age
## 5 2 age
## 5 3 age
## 5 4 age
## 5 5 age
## 6 1 age
## 6 2 age
## 6 3 age
## 6 4 age
## 6 5 age
## 7 1 age
## 7 2 age
## 7 3 age
## 7 4 age
## 7 5 age
## 8 1 age
## 8 2 age
## 8 3 age
## 8 4 age
## 8 5 age
## 9 1 age
## 9 2 age
## 9 3 age
## 9 4 age
## 9 5 age
## 10 1 age
## 10 2 age
## 10 3 age
## 10 4 age
## 10 5 age
## 11 1 age
## 11 2 age
## 11 3 age
## 11 4 age
## 11 5 age
## 12 1 age
## 12 2 age
## 12 3 age
## 12 4 age
## 12 5 age
## 13 1 age
## 13 2 age
## 13 3 age
## 13 4 age
## 13 5 age
## 14 1 age
## 14 2 age
## 14 3 age
## 14 4 age
## 14 5 age
## 15 1 age
## 15 2 age
## 15 3 age
## 15 4 age
## 15 5 age
## 16 1 age
## 16 2 age
## 16 3 age
## 16 4 age
## 16 5 age
## 17 1 age
## 17 2 age
## 17 3 age
## 17 4 age
## 17 5 age
## 18 1 age
## 18 2 age
## 18 3 age
## 18 4 age
## 18 5 age
## 19 1 age
## 19 2 age
## 19 3 age
## 19 4 age
## 19 5 age
## 20 1 age
## 20 2 age
## 20 3 age
## 20 4 age
## 20 5 age
## 21 1 age
## 21 2 age
## 21 3 age
## 21 4 age
## 21 5 age
## 22 1 age
## 22 2 age
## 22 3 age
## 22 4 age
## 22 5 age
## 23 1 age
## 23 2 age
## 23 3 age
## 23 4 age
## 23 5 age
## 24 1 age
## 24 2 age
## 24 3 age
## 24 4 age
## 24 5 age
## 25 1 age
## 25 2 age
## 25 3 age
## 25 4 age
## 25 5 age
## 26 1 age
## 26 2 age
## 26 3 age
## 26 4 age
## 26 5 age
## 27 1 age
## 27 2 age
## 27 3 age
## 27 4 age
## 27 5 age
## 28 1 age
## 28 2 age
## 28 3 age
## 28 4 age
## 28 5 age
## 29 1 age
## 29 2 age
## 29 3 age
## 29 4 age
## 29 5 age
## 30 1 age
## 30 2 age
## 30 3 age
## 30 4 age
## 30 5 age
## 31 1 age
## 31 2 age
## 31 3 age
## 31 4 age
## 31 5 age
## 32 1 age
## 32 2 age
## 32 3 age
## 32 4 age
## 32 5 age
## 33 1 age
## 33 2 age
## 33 3 age
## 33 4 age
## 33 5 age
## 34 1 age
## 34 2 age
## 34 3 age
## 34 4 age
## 34 5 age
## 35 1 age
## 35 2 age
## 35 3 age
## 35 4 age
## 35 5 age
## 36 1 age
## 36 2 age
## 36 3 age
## 36 4 age
## 36 5 age
## 37 1 age
## 37 2 age
## 37 3 age
## 37 4 age
## 37 5 age
## 38 1 age
## 38 2 age
## 38 3 age
## 38 4 age
## 38 5 age
## 39 1 age
## 39 2 age
## 39 3 age
## 39 4 age
## 39 5 age
## 40 1 age
## 40 2 age
## 40 3 age
## 40 4 age
## 40 5 age
## 41 1 age
## 41 2 age
## 41 3 age
## 41 4 age
## 41 5 age
## 42 1 age
## 42 2 age
## 42 3 age
## 42 4 age
## 42 5 age
## 43 1 age
## 43 2 age
## 43 3 age
## 43 4 age
## 43 5 age
## 44 1 age
## 44 2 age
## 44 3 age
## 44 4 age
## 44 5 age
## 45 1 age
## 45 2 age
## 45 3 age
## 45 4 age
## 45 5 age
## 46 1 age
## 46 2 age
## 46 3 age
## 46 4 age
## 46 5 age
## 47 1 age
## 47 2 age
## 47 3 age
## 47 4 age
## 47 5 age
## 48 1 age
## 48 2 age
## 48 3 age
## 48 4 age
## 48 5 age
## 49 1 age
## 49 2 age
## 49 3 age
## 49 4 age
## 49 5 age
## 50 1 age
## 50 2 age
## 50 3 age
## 50 4 age
## 50 5 age
## Warning: Number of logged events: 1
mean(titanic$age, na.rm = TRUE) ##the original data's mean value
## [1] 29.88113
lapply(imputed_data.test$imp$age,mean) ##each imputed set's mean
## $`1`
## [1] 29.57547
##
## $`2`
## [1] 24.76415
##
## $`3`
## [1] 26.62736
##
## $`4`
## [1] 24.76101
##
## $`5`
## [1] 25.96226
sd(titanic$age, na.rm = TRUE) ##the original data's standard deviation value
## [1] 14.4135
lapply(imputed_data.test$imp$age,sd) ####each imputed set's standard deviation
## $`1`
## [1] 12.62606
##
## $`2`
## [1] 7.811852
##
## $`3`
## [1] 11.43624
##
## $`4`
## [1] 10.28185
##
## $`5`
## [1] 11.60398
titanic.test.complete<-complete(imputed_data.test, 1)
titanic.train.complete$sibsp<-as.factor(titanic.train.complete$sibsp)
titanic.train.complete$parch<-as.factor(titanic.train.complete$parch)
titanic.test.complete$sibsp<-as.factor(titanic.test.complete$sibsp)
titanic.test.complete$parch<-as.factor(titanic.test.complete$parch)
plot(summary(survived~sex+pclass+age+parch+sibsp,data=titanic3))
dd <- datadist (titanic.train.complete)
options(datadist = 'dd')
fit1<-lrm(survived ~ age+sex+pclass+sibsp+parch+fare,data = titanic.train.complete)
fit1
## Logistic Regression Model
##
## lrm(formula = survived ~ age + sex + pclass + sibsp + parch +
## fare, data = titanic.train.complete)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 263 LR chi2 128.68 R2 0.520 C 0.868
## 0 152 d.f. 16 R2(16,263)0.348 Dxy 0.735
## 1 111 Pr(> chi2) <0.0001 R2(16,192.5)0.443 gamma 0.736
## max |deriv| 0.06 Brier 0.141 tau-a 0.360
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 5.0284 0.9145 5.50 <0.0001
## age -0.0586 0.0155 -3.77 0.0002
## sex=male -2.2225 0.3581 -6.21 <0.0001
## pclass=2nd -1.9852 0.5756 -3.45 0.0006
## pclass=3rd -3.2222 0.5996 -5.37 <0.0001
## sibsp=1 -0.7113 0.4382 -1.62 0.1045
## sibsp=2 -0.9224 1.1272 -0.82 0.4132
## sibsp=3 -10.2403 40.0022 -0.26 0.7980
## sibsp=4 -2.7062 1.3723 -1.97 0.0486
## sibsp=5 -8.5961 61.5411 -0.14 0.8889
## sibsp=8 -7.5268 61.5406 -0.12 0.9027
## parch=1 1.1520 0.5653 2.04 0.0416
## parch=2 0.9823 0.8407 1.17 0.2427
## parch=3 9.1553 61.7992 0.15 0.8822
## parch=4 -6.7969 61.5367 -0.11 0.9120
## parch=5 -4.6652 61.5373 -0.08 0.9396
## fare -0.0032 0.0045 -0.71 0.4747
##
fit2<-lrm(survived ~ pclass+sex*rcs(age,3)+sibsp+parch+fare,data=titanic.train.complete)
fit2
## Logistic Regression Model
##
## lrm(formula = survived ~ pclass + sex * rcs(age, 3) + sibsp +
## parch + fare, data = titanic.train.complete)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 263 LR chi2 142.51 R2 0.562 C 0.882
## 0 152 d.f. 19 R2(19,263)0.375 Dxy 0.764
## 1 111 Pr(> chi2) <0.0001 R2(19,192.5)0.474 gamma 0.764
## max |deriv| 0.05 Brier 0.134 tau-a 0.374
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 5.2679 1.3062 4.03 <0.0001
## pclass=2nd -2.8396 0.7405 -3.83 0.0001
## pclass=3rd -3.9390 0.7418 -5.31 <0.0001
## sex=male 0.1059 1.4231 0.07 0.9407
## age -0.0538 0.0468 -1.15 0.2507
## age' 0.0871 0.0883 0.99 0.3239
## sibsp=1 -0.7915 0.4612 -1.72 0.0862
## sibsp=2 -0.8756 1.0085 -0.87 0.3852
## sibsp=3 -10.4904 43.2428 -0.24 0.8083
## sibsp=4 -3.1534 1.3363 -2.36 0.0183
## sibsp=5 -10.4083 61.5493 -0.17 0.8657
## sibsp=8 -8.0061 61.5414 -0.13 0.8965
## parch=1 1.2378 0.6452 1.92 0.0550
## parch=2 1.2089 0.8847 1.37 0.1718
## parch=3 7.8746 61.8017 0.13 0.8986
## parch=4 -8.2853 61.5394 -0.13 0.8929
## parch=5 -3.6351 61.5388 -0.06 0.9529
## fare -0.0073 0.0056 -1.31 0.1889
## sex=male * age -0.0778 0.0654 -1.19 0.2336
## sex=male * age' -0.0582 0.1258 -0.46 0.6435
##
# Trying to increase the number of knots on restricted cubic age, eliminating the parch variable due to its insignificance, and defining interaction terms between pclass and sex.
fit3<-lrm(survived ~ sex*rcs(age,5) + sibsp + pclass*sex+fare,data=titanic.train.complete)
fit3
## Logistic Regression Model
##
## lrm(formula = survived ~ sex * rcs(age, 5) + sibsp + pclass *
## sex + fare, data = titanic.train.complete)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 263 LR chi2 151.12 R2 0.588 C 0.884
## 0 152 d.f. 20 R2(20,263)0.393 Dxy 0.768
## 1 111 Pr(> chi2) <0.0001 R2(20,192.5)0.494 gamma 0.768
## max |deriv| 0.5 Brier 0.127 tau-a 0.376
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 23.3049 31.9977 0.73 0.4664
## sex=male -16.3895 31.8109 -0.52 0.6064
## age -0.0249 0.1022 -0.24 0.8076
## age' -0.8671 0.7608 -1.14 0.2544
## age'' 10.7499 7.4198 1.45 0.1474
## age''' -17.4453 11.6177 -1.50 0.1332
## sibsp=1 -0.2438 0.5187 -0.47 0.6383
## sibsp=2 -0.3255 1.0317 -0.32 0.7524
## sibsp=3 -13.1538 192.4452 -0.07 0.9455
## sibsp=4 -2.3974 1.2926 -1.85 0.0636
## sibsp=5 -13.1330 275.7687 -0.05 0.9620
## sibsp=8 -8.9991 275.7655 -0.03 0.9740
## pclass=2nd -18.7704 31.9265 -0.59 0.5566
## pclass=3rd -21.3330 31.9562 -0.67 0.5044
## fare -0.0258 0.0156 -1.66 0.0969
## sex=male * age -0.1951 0.1611 -1.21 0.2258
## sex=male * age' 1.1215 0.9947 1.13 0.2595
## sex=male * age'' -10.3224 9.2079 -1.12 0.2623
## sex=male * age''' 15.2048 14.3160 1.06 0.2882
## sex=male * pclass=2nd 15.6556 31.8079 0.49 0.6226
## sex=male * pclass=3rd 17.7430 31.8115 0.56 0.5770
##
fit4<-lrm(survived~sex*rcs(age,5)+ sibsp + pclass*sex,data=titanic.train.complete)
fit4
## Logistic Regression Model
##
## lrm(formula = survived ~ sex * rcs(age, 5) + sibsp + pclass *
## sex, data = titanic.train.complete)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 263 LR chi2 147.66 R2 0.578 C 0.879
## 0 152 d.f. 19 R2(19,263)0.387 Dxy 0.758
## 1 111 Pr(> chi2) <0.0001 R2(19,192.5)0.488 gamma 0.762
## max |deriv| 0.06 Brier 0.131 tau-a 0.371
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 13.1778 27.4628 0.48 0.6313
## sex=male -7.6017 27.5221 -0.28 0.7824
## age -0.0322 0.1022 -0.32 0.7525
## age' -0.8413 0.7611 -1.11 0.2690
## age'' 10.5137 7.4349 1.41 0.1573
## age''' -17.0725 11.6639 -1.46 0.1433
## sibsp=1 -0.6675 0.4753 -1.40 0.1603
## sibsp=2 -0.7980 0.9844 -0.81 0.4175
## sibsp=3 -12.5523 116.9498 -0.11 0.9145
## sibsp=4 -2.9065 1.3039 -2.23 0.0258
## sibsp=5 -13.1022 167.2688 -0.08 0.9376
## sibsp=8 -9.6145 167.2603 -0.06 0.9542
## pclass=2nd -8.8170 27.4348 -0.32 0.7479
## pclass=3rd -11.2348 27.4352 -0.41 0.6822
## sex=male * age -0.1826 0.1644 -1.11 0.2667
## sex=male * age' 1.0593 1.0005 1.06 0.2897
## sex=male * age'' -9.5153 9.1807 -1.04 0.3000
## sex=male * age''' 13.7797 14.2408 0.97 0.3332
## sex=male * pclass=2nd 6.6230 27.4456 0.24 0.8093
## sex=male * pclass=3rd 8.7366 27.4436 0.32 0.7502
##
anova(fit4)
## Wald Statistics Response: survived
##
## Factor Chi-Square d.f. P
## sex (Factor+Higher Order Factors) 26.50 7 0.0004
## All Interactions 10.76 6 0.0961
## age (Factor+Higher Order Factors) 23.96 8 0.0023
## All Interactions 3.14 4 0.5346
## Nonlinear (Factor+Higher Order Factors) 6.02 6 0.4206
## sibsp 6.19 6 0.4022
## pclass (Factor+Higher Order Factors) 22.11 4 0.0002
## All Interactions 5.04 2 0.0803
## sex * age (Factor+Higher Order Factors) 3.14 4 0.5346
## Nonlinear 1.22 3 0.7485
## Nonlinear Interaction : f(A,B) vs. AB 1.22 3 0.7485
## sex * pclass (Factor+Higher Order Factors) 5.04 2 0.0803
## TOTAL NONLINEAR 6.02 6 0.4206
## TOTAL INTERACTION 10.76 6 0.0961
## TOTAL NONLINEAR + INTERACTION 14.45 9 0.1072
## TOTAL 47.34 19 0.0003
ggplot(Predict(fit4, age , sex , pclass, sibsp=0 , fun=plogis))
test<-predict(fit4, titanic.test.complete) ## apply the model to the test data
test<-ifelse(test > 0.5, 1,0) ## convert the probability value to survive or not.
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
confusionMatrix(table(test,actual), positive = "1") ## compare the model's output to the actual data
## Confusion Matrix and Statistics
##
## actual
## test 0 1
## 0 142 38
## 1 10 73
##
## Accuracy : 0.8175
## 95% CI : (0.7654, 0.8623)
## No Information Rate : 0.5779
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6127
##
## Mcnemar's Test P-Value : 9.735e-05
##
## Sensitivity : 0.6577
## Specificity : 0.9342
## Pos Pred Value : 0.8795
## Neg Pred Value : 0.7889
## Prevalence : 0.4221
## Detection Rate : 0.2776
## Detection Prevalence : 0.3156
## Balanced Accuracy : 0.7959
##
## 'Positive' Class : 1
##