library(ggplot2)
library(ggthemes)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Point the session at the project root; the CSV paths below are relative to it.
# NOTE(review): this absolute path is machine-specific — adjust it before
# running the script anywhere else.
setwd("/Users/milosjanicki/Data_projects/Date_Guide")
titanic_train <- read.csv(file = "Titanic/train.csv")
titanic_test <- read.csv(file = "Titanic/test.csv")
# The test file has no labels: add an all-NA Survived column so both frames
# share the same columns, then stack train on top of test.
titanic_test[["Survived"]] <- NA
titanic <- rbind(titanic_train, titanic_test)
Variable descriptions:
- survival – Survival (0 = No; 1 = Yes)
- pclass – Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
- name – Name
- sex – Sex
- age – Age
- sibsp – Number of Siblings/Spouses Aboard
- parch – Number of Parents/Children Aboard
- ticket – Ticket Number
- fare – Passenger Fare
- cabin – Cabin
- embarked – Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
# Show the column types and a few sample values of the combined data frame.
str(titanic)
## 'data.frame': 1309 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 354 273 16 555 516 625 413 577 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
# Preview the first rows of the combined data frame.
head(titanic)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 S
## 6 0 330877 8.4583 Q
Let’s take a look at a bar plot of survival rates:
# Bar chart of survival counts in the training set; each bar is annotated with
# its share of all plotted passengers.
ggplot(titanic_train, aes(factor(Survived))) +
  geom_bar() +
  labs(x = "Survived", y = "Passengers") +
  stat_count(
    # Round the share to one decimal so the label is a short percentage
    # instead of an unrounded double with a dozen digits.
    aes(label = paste0(round(prop.table(..count..) * 100, 1), "%")),
    vjust = 1, geom = "text", position = "identity", color = "white"
  ) +
  theme_economist() +
  scale_colour_economist()
Roughly 2/5 survived…not so bad.
Let’s take one more look at our data - let’s check how many levels our variables have
# Number of distinct values in every column (NA counts as a value of its own).
vapply(titanic, function(column) length(unique(column)), integer(1))
## PassengerId Survived Pclass Name Sex Age
## 1309 3 3 1307 2 99
## SibSp Parch Ticket Fare Cabin Embarked
## 7 8 929 282 187 4
(Don’t be surprised that Survived has 3 distinct values now: the third one is NA, which comes from the unlabeled test rows we appended to the training data.)
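It seems reasonable to discard variables such as Name and Ticket, since they have almost as many distinct values as there are observations. PassengerId is just as uninformative, but the column is kept because it is needed later to build the submission file.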
# Remove the two near-unique text columns from the working data frame.
titanic <- select(titanic, -Name, -Ticket)
Several variables, however, need to be transformed into factors: Cabin and Age.
The Cabin variable combines deck and room data. Let’s try to split it, as that might give us some information on the location of each passenger’s cabin.
As we know, the Titanic cracked in the middle, opening it up for the water to flood the decks. The ship sank with the bow part elevated, while the rear sank first. Perhaps the passengers who were far from the central part had higher chances of survival? Or perhaps those who were in the bow part of the ship, as that part sank last?
# Split Cabin (e.g. "C123") into a deck letter and a numeric cabin number.
# as.character() makes this work whether Cabin was read as a factor or a string.
cabin <- as.character(titanic$Cabin)
# The first character is the deck letter; passengers without a recorded cabin
# end up in the "" level.
titanic$Deck <- as.factor(substr(cabin, 1, 1))
# Use the first complete run of digits as the cabin number. A fixed-width
# substr(cabin, 2, 3) truncates three-digit cabins ("C123" -> 12) and yields NA
# whenever characters 2-3 are not digits.
has_number <- grepl("[0-9]", cabin)
cabin_number <- rep(NA_character_, length(cabin))
cabin_number[has_number] <- sub(
  "^[^0-9]*([0-9]+).*$", "\\1", cabin[has_number]
)
# Entries without any digits stay NA, so no coercion warning is raised here.
titanic$Location <- as.numeric(cabin_number)
## Warning: NAs introduced by coercion
# Distribution of the numeric cabin number, including how many values are NA.
summary(titanic$Location)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.00 13.00 33.00 37.49 53.75 99.00 1027
# Bin the numeric cabin number into three coarse ship sections:
#   <= 13 -> "The Bow", 14..53 -> "The Middle", > 53 -> "The Rear".
# The top bin is open-ended so cabin numbers of 100 and above are classified
# instead of silently becoming NA. Missing cabin numbers stay NA.
titanic$Location <- cut(
  titanic$Location,
  breaks = c(-Inf, 13, 53, Inf),
  labels = c("The Bow", "The Middle", "The Rear")
)
# Cabin is now represented by Deck and Location, so the raw column is dropped.
titanic <- select(titanic, -Cabin)
Let’s see if we made something meaningful
# Row-wise proportions: share of non-survivors (0) and survivors (1) per Location.
prop.table(table(titanic$Location, titanic$Survived),1)
##
## 0 1
## The Bow 0.3076923 0.6923077
## The Middle 0.3052632 0.6947368
## The Rear 0.3673469 0.6326531
# Row-wise proportions: share of non-survivors (0) and survivors (1) per Deck.
prop.table(table(titanic$Deck, titanic$Survived),1)
##
## 0 1
## 0.7001456 0.2998544
## A 0.5333333 0.4666667
## B 0.2553191 0.7446809
## C 0.4067797 0.5932203
## D 0.2424242 0.7575758
## E 0.2500000 0.7500000
## F 0.3846154 0.6153846
## G 0.5000000 0.5000000
## T 1.0000000 0.0000000
As expected, passengers in the rear part have slightly lower chances of survival. The central cabins do not seem to be affected. Certain decks seem to have higher chances of survival; let’s hope it is not merely noise.
Also let’s create a new variable for children as it seems to be a good idea
# Replace missing ages with the mean of the observed ages.
titanic$Age <- replace(
  titanic$Age,
  is.na(titanic$Age),
  mean(titanic$Age, na.rm = TRUE)
)
# Flag passengers younger than 16 as children, everyone else as adults.
# NOTE(review): every imputed age equals the mean, so all passengers with an
# unknown age fall into the same category — confirm that this is acceptable.
titanic$Age_cat <- as.factor(ifelse(titanic$Age < 16, "Child", "Adult"))
Let’s create a family size variable too
# Family size = siblings/spouses + parents/children + the passenger themself.
titanic$family_size <- with(titanic, SibSp + Parch + 1)
summary(titanic$family_size)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.884 2.000 11.000
# Treat family size as a categorical variable rather than a numeric one.
titanic$family_size <- as.factor(titanic$family_size)
Let’s see if we did something meaningful
# Row-wise proportions: share of non-survivors (0) and survivors (1) per family size.
prop.table(table(titanic$family_size, titanic$Survived),1)
##
## 0 1
## 1 0.6964618 0.3035382
## 2 0.4472050 0.5527950
## 3 0.4215686 0.5784314
## 4 0.2758621 0.7241379
## 5 0.8000000 0.2000000
## 6 0.8636364 0.1363636
## 7 0.6666667 0.3333333
## 8 1.0000000 0.0000000
## 11 1.0000000 0.0000000
# Row-wise proportions: share of non-survivors (0) and survivors (1) per age category.
prop.table(table(titanic$Age_cat, titanic$Survived),1)
##
## 0 1
## Adult 0.6373762 0.3626238
## Child 0.4096386 0.5903614
Children have a better survival rate. When it comes to family size, large families do very badly, while couples alone or couples with a child (or another family member) seem to have the highest survival rates.
Now the dataset looks better but we have to deal with the problem of missing data
First let’s fill in the empty cells with NA values
# Disabled attempt to recode blank / "NA" strings as real NA values.
# NOTE(review): the condition joins both string tests with `&`, which cannot be
# TRUE for a single value; `|` was presumably intended — confirm before
# re-enabling this line.
#titanic[sapply(titanic, function(x) as.character(x)=="" & as.character(x)=="NA" )] <- NA
# Total number of NA cells across the whole data frame.
sum(is.na(titanic))
## [1] 1446
# Number of NA cells in each column.
vapply(titanic, function(column) sum(is.na(column)), integer(1))
## PassengerId Survived Pclass Sex Age SibSp
## 0 418 0 0 0 0
## Parch Fare Embarked Deck Location Age_cat
## 0 1 0 0 1027 0
## family_size
## 0
# Plot a map of missing vs. observed cells (Amelia::missmap) before imputation.
missmap(titanic, main = "Missing values vs observed")
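We still have 1446 NA values: 418 in Survived (the unlabeled test rows), 1027 in Location and 1 in Fare. The blank strings in Deck and Embarked are not counted, because they are stored as empty values rather than NA.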
How to deal with them?
# Fill the remaining gaps in the numeric columns with the column mean.
titanic$Age <- replace(
  titanic$Age,
  is.na(titanic$Age),
  mean(titanic$Age, na.rm = TRUE)
)
titanic$Fare <- replace(
  titanic$Fare,
  is.na(titanic$Fare),
  mean(titanic$Fare, na.rm = TRUE)
)
# Make sure every categorical column, including the response, is a factor.
titanic <- transform(
  titanic,
  Sex = as.factor(Sex),
  Embarked = as.factor(Embarked),
  Survived = as.factor(Survived),
  family_size = as.factor(family_size)
)
# Re-count the NA cells per column after imputation.
vapply(titanic, function(column) sum(is.na(column)), integer(1))
## PassengerId Survived Pclass Sex Age SibSp
## 0 418 0 0 0 0
## Parch Fare Embarked Deck Location Age_cat
## 0 0 0 0 1027 0
## family_size
## 0
# Redraw the missingness map (Amelia::missmap) after imputation.
missmap(titanic, main = "Missing values vs observed")
From what I understand, the model will deal with the missing values in the categorical variables on its own, as nobody on the web bothers to fill in the NAs with a randomized distribution of some sort.
Now let’s take a look at our survival predictions:
# Working copy without the row identifier, used for the per-variable survival
# tables. "Name" is no longer listed here: that column was already removed
# earlier, so asking to drop it again only produced an "Unknown variables"
# warning.
titanic_check <- titanic %>% select(-PassengerId)
## Warning in one_of(c("PassengerId", "Name")): Unknown variables: `Name`
# For every column, tabulate its values against Survived and convert each row
# to proportions (non-survivors vs. survivors). The result is a named list with
# one table per column.
lapply(
  titanic_check,
  function(column) prop.table(table(column, titanic_check$Survived), 1)
)
## $Survived
##
## x 0 1
## 0 1 0
## 1 0 1
##
## $Pclass
##
## x 0 1
## 1 0.3703704 0.6296296
## 2 0.5271739 0.4728261
## 3 0.7576375 0.2423625
##
## $Sex
##
## x 0 1
## female 0.2579618 0.7420382
## male 0.8110919 0.1889081
##
## $Age
##
## x 0 1
## 0.17
## 0.33
## 0.42 0.0000000 1.0000000
## 0.67 0.0000000 1.0000000
## 0.75 0.0000000 1.0000000
## 0.83 0.0000000 1.0000000
## 0.92 0.0000000 1.0000000
## 1 0.2857143 0.7142857
## 2 0.7000000 0.3000000
## 3 0.1666667 0.8333333
## 4 0.3000000 0.7000000
## 5 0.0000000 1.0000000
## 6 0.3333333 0.6666667
## 7 0.6666667 0.3333333
## 8 0.5000000 0.5000000
## 9 0.7500000 0.2500000
## 10 1.0000000 0.0000000
## 11 0.7500000 0.2500000
## 11.5
## 12 0.0000000 1.0000000
## 13 0.0000000 1.0000000
## 14 0.5000000 0.5000000
## 14.5 1.0000000 0.0000000
## 15 0.2000000 0.8000000
## 16 0.6470588 0.3529412
## 17 0.5384615 0.4615385
## 18 0.6538462 0.3461538
## 18.5
## 19 0.6400000 0.3600000
## 20 0.8000000 0.2000000
## 20.5 1.0000000 0.0000000
## 21 0.7916667 0.2083333
## 22 0.5925926 0.4074074
## 22.5
## 23 0.6666667 0.3333333
## 23.5 1.0000000 0.0000000
## 24 0.5000000 0.5000000
## 24.5 1.0000000 0.0000000
## 25 0.7391304 0.2608696
## 26 0.6666667 0.3333333
## 26.5
## 27 0.3888889 0.6111111
## 28 0.7200000 0.2800000
## 28.5 1.0000000 0.0000000
## 29 0.6000000 0.4000000
## 29.881137667304 0.7062147 0.2937853
## 30 0.6000000 0.4000000
## 30.5 1.0000000 0.0000000
## 31 0.5294118 0.4705882
## 32 0.5000000 0.5000000
## 32.5 0.5000000 0.5000000
## 33 0.6000000 0.4000000
## 34 0.6000000 0.4000000
## 34.5 1.0000000 0.0000000
## 35 0.3888889 0.6111111
## 36 0.5000000 0.5000000
## 36.5 1.0000000 0.0000000
## 37 0.8333333 0.1666667
## 38 0.5454545 0.4545455
## 38.5
## 39 0.6428571 0.3571429
## 40 0.5384615 0.4615385
## 40.5 1.0000000 0.0000000
## 41 0.6666667 0.3333333
## 42 0.5384615 0.4615385
## 43 0.8000000 0.2000000
## 44 0.6666667 0.3333333
## 45 0.5833333 0.4166667
## 45.5 1.0000000 0.0000000
## 46 1.0000000 0.0000000
## 47 0.8888889 0.1111111
## 48 0.3333333 0.6666667
## 49 0.3333333 0.6666667
## 50 0.5000000 0.5000000
## 51 0.7142857 0.2857143
## 52 0.5000000 0.5000000
## 53 0.0000000 1.0000000
## 54 0.6250000 0.3750000
## 55 0.5000000 0.5000000
## 55.5 1.0000000 0.0000000
## 56 0.5000000 0.5000000
## 57 1.0000000 0.0000000
## 58 0.4000000 0.6000000
## 59 1.0000000 0.0000000
## 60 0.5000000 0.5000000
## 60.5
## 61 1.0000000 0.0000000
## 62 0.5000000 0.5000000
## 63 0.0000000 1.0000000
## 64 1.0000000 0.0000000
## 65 1.0000000 0.0000000
## 66 1.0000000 0.0000000
## 67
## 70 1.0000000 0.0000000
## 70.5 1.0000000 0.0000000
## 71 1.0000000 0.0000000
## 74 1.0000000 0.0000000
## 76
## 80 0.0000000 1.0000000
##
## $SibSp
##
## x 0 1
## 0 0.6546053 0.3453947
## 1 0.4641148 0.5358852
## 2 0.5357143 0.4642857
## 3 0.7500000 0.2500000
## 4 0.8333333 0.1666667
## 5 1.0000000 0.0000000
## 8 1.0000000 0.0000000
##
## $Parch
##
## x 0 1
## 0 0.6563422 0.3436578
## 1 0.4491525 0.5508475
## 2 0.5000000 0.5000000
## 3 0.4000000 0.6000000
## 4 1.0000000 0.0000000
## 5 0.8000000 0.2000000
## 6 1.0000000 0.0000000
## 9
##
## $Fare
##
## x 0 1
## 0 0.93333333 0.06666667
## 3.1708
## 4.0125 1.00000000 0.00000000
## 5 1.00000000 0.00000000
## 6.2375 1.00000000 0.00000000
## 6.4375 1.00000000 0.00000000
## 6.45 1.00000000 0.00000000
## 6.4958 1.00000000 0.00000000
## 6.75 1.00000000 0.00000000
## 6.8583 1.00000000 0.00000000
## 6.95 1.00000000 0.00000000
## 6.975 0.50000000 0.50000000
## 7
## 7.0458 1.00000000 0.00000000
## 7.05 1.00000000 0.00000000
## 7.0542 1.00000000 0.00000000
## 7.125 1.00000000 0.00000000
## 7.1417 0.00000000 1.00000000
## 7.225 0.75000000 0.25000000
## 7.2292 0.73333333 0.26666667
## 7.25 0.92307692 0.07692308
## 7.2833
## 7.3125 1.00000000 0.00000000
## 7.4958 0.66666667 0.33333333
## 7.5208 1.00000000 0.00000000
## 7.55 0.75000000 0.25000000
## 7.575
## 7.5792
## 7.6292 1.00000000 0.00000000
## 7.65 0.75000000 0.25000000
## 7.7208
## 7.725 1.00000000 0.00000000
## 7.7292 1.00000000 0.00000000
## 7.7333 0.50000000 0.50000000
## 7.7375 0.50000000 0.50000000
## 7.7417 1.00000000 0.00000000
## 7.75 0.64705882 0.35294118
## 7.775 0.81250000 0.18750000
## 7.7792
## 7.7875 0.00000000 1.00000000
## 7.7958 0.66666667 0.33333333
## 7.8 1.00000000 0.00000000
## 7.8208
## 7.8292 0.50000000 0.50000000
## 7.85
## 7.8542 0.76923077 0.23076923
## 7.875 1.00000000 0.00000000
## 7.8792 0.00000000 1.00000000
## 7.8875 1.00000000 0.00000000
## 7.8958 0.97368421 0.02631579
## 7.925 0.55555556 0.44444444
## 8.0292 0.00000000 1.00000000
## 8.05 0.88372093 0.11627907
## 8.1125 0.00000000 1.00000000
## 8.1375 1.00000000 0.00000000
## 8.1583 1.00000000 0.00000000
## 8.3 1.00000000 0.00000000
## 8.3625 1.00000000 0.00000000
## 8.4042 1.00000000 0.00000000
## 8.4333 1.00000000 0.00000000
## 8.4583 1.00000000 0.00000000
## 8.5167 0.00000000 1.00000000
## 8.6542 1.00000000 0.00000000
## 8.6625 0.92307692 0.07692308
## 8.6833 0.00000000 1.00000000
## 8.7125 1.00000000 0.00000000
## 8.85 1.00000000 0.00000000
## 8.9625
## 9 1.00000000 0.00000000
## 9.2167 1.00000000 0.00000000
## 9.225 1.00000000 0.00000000
## 9.325
## 9.35 0.50000000 0.50000000
## 9.475 1.00000000 0.00000000
## 9.4833 1.00000000 0.00000000
## 9.5 0.77777778 0.22222222
## 9.5875 0.50000000 0.50000000
## 9.6875
## 9.825 1.00000000 0.00000000
## 9.8375 1.00000000 0.00000000
## 9.8417 0.00000000 1.00000000
## 9.8458 1.00000000 0.00000000
## 10.1708 1.00000000 0.00000000
## 10.4625 1.00000000 0.00000000
## 10.5 0.62500000 0.37500000
## 10.5167 1.00000000 0.00000000
## 10.7083
## 11.1333 0.00000000 1.00000000
## 11.2417 0.00000000 1.00000000
## 11.5 1.00000000 0.00000000
## 12 0.00000000 1.00000000
## 12.1833
## 12.275 1.00000000 0.00000000
## 12.2875 0.00000000 1.00000000
## 12.35 0.33333333 0.66666667
## 12.475 0.00000000 1.00000000
## 12.525 1.00000000 0.00000000
## 12.65 0.00000000 1.00000000
## 12.7375
## 12.875 1.00000000 0.00000000
## 13 0.61904762 0.38095238
## 13.4167 0.00000000 1.00000000
## 13.5 0.75000000 0.25000000
## 13.775
## 13.7917 0.00000000 1.00000000
## 13.8583 0.00000000 1.00000000
## 13.8625 0.00000000 1.00000000
## 13.9
## 14 1.00000000 0.00000000
## 14.1083 1.00000000 0.00000000
## 14.4 1.00000000 0.00000000
## 14.4542 0.85714286 0.14285714
## 14.4583 1.00000000 0.00000000
## 14.5 0.71428571 0.28571429
## 15 1.00000000 0.00000000
## 15.0333
## 15.0458 1.00000000 0.00000000
## 15.05 1.00000000 0.00000000
## 15.1 1.00000000 0.00000000
## 15.2458 0.40000000 0.60000000
## 15.5 0.62500000 0.37500000
## 15.55 1.00000000 0.00000000
## 15.5792
## 15.7417 0.00000000 1.00000000
## 15.75 0.00000000 1.00000000
## 15.85 0.50000000 0.50000000
## 15.9 0.00000000 1.00000000
## 16 0.00000000 1.00000000
## 16.1 0.77777778 0.22222222
## 16.7 0.00000000 1.00000000
## 17.4 0.00000000 1.00000000
## 17.8 1.00000000 0.00000000
## 18 1.00000000 0.00000000
## 18.75 0.00000000 1.00000000
## 18.7875 0.50000000 0.50000000
## 19.2583 0.00000000 1.00000000
## 19.5 0.00000000 1.00000000
## 19.9667 1.00000000 0.00000000
## 20.2125 1.00000000 0.00000000
## 20.25 0.50000000 0.50000000
## 20.525 0.33333333 0.66666667
## 20.575 0.50000000 0.50000000
## 21 0.66666667 0.33333333
## 21.075 1.00000000 0.00000000
## 21.6792 1.00000000 0.00000000
## 22.025 0.00000000 1.00000000
## 22.3583 0.00000000 1.00000000
## 22.525 1.00000000 0.00000000
## 23 0.00000000 1.00000000
## 23.25 0.00000000 1.00000000
## 23.45 1.00000000 0.00000000
## 24 0.50000000 0.50000000
## 24.15 0.87500000 0.12500000
## 25.4667 1.00000000 0.00000000
## 25.5875 1.00000000 0.00000000
## 25.7
## 25.7417
## 25.925 1.00000000 0.00000000
## 25.9292 0.00000000 1.00000000
## 26 0.51612903 0.48387097
## 26.25 0.33333333 0.66666667
## 26.2833 0.00000000 1.00000000
## 26.2875 0.00000000 1.00000000
## 26.3875 0.00000000 1.00000000
## 26.55 0.46666667 0.53333333
## 27 0.50000000 0.50000000
## 27.4458
## 27.7208 0.80000000 0.20000000
## 27.75 0.50000000 0.50000000
## 27.9 1.00000000 0.00000000
## 28.5 1.00000000 0.00000000
## 28.5375
## 28.7125 1.00000000 0.00000000
## 29 0.00000000 1.00000000
## 29.125 1.00000000 0.00000000
## 29.7 0.66666667 0.33333333
## 30 0.16666667 0.83333333
## 30.0708 0.50000000 0.50000000
## 30.5 0.20000000 0.80000000
## 30.6958 1.00000000 0.00000000
## 31 0.33333333 0.66666667
## 31.275 1.00000000 0.00000000
## 31.3875 0.25000000 0.75000000
## 31.5
## 31.6792
## 31.6833
## 32.3208 1.00000000 0.00000000
## 32.5 0.00000000 1.00000000
## 33 0.33333333 0.66666667
## 33.2954792813456
## 33.5 1.00000000 0.00000000
## 34.0208 1.00000000 0.00000000
## 34.375 1.00000000 0.00000000
## 34.6542 1.00000000 0.00000000
## 35 1.00000000 0.00000000
## 35.5 0.25000000 0.75000000
## 36.75 0.50000000 0.50000000
## 37.0042 0.50000000 0.50000000
## 38.5 1.00000000 0.00000000
## 39 0.25000000 0.75000000
## 39.4 0.00000000 1.00000000
## 39.6 0.50000000 0.50000000
## 39.6875 1.00000000 0.00000000
## 40.125 1.00000000 0.00000000
## 41.5792 0.33333333 0.66666667
## 42.4 1.00000000 0.00000000
## 42.5
## 45.5
## 46.9 1.00000000 0.00000000
## 47.1 1.00000000 0.00000000
## 49.5 0.00000000 1.00000000
## 49.5042 0.50000000 0.50000000
## 50 1.00000000 0.00000000
## 50.4958 1.00000000 0.00000000
## 51.4792 0.00000000 1.00000000
## 51.8625 0.50000000 0.50000000
## 52 0.57142857 0.42857143
## 52.5542 0.00000000 1.00000000
## 53.1 0.40000000 0.60000000
## 55 0.00000000 1.00000000
## 55.4417 0.00000000 1.00000000
## 55.9 0.50000000 0.50000000
## 56.4958 0.28571429 0.71428571
## 56.9292 0.00000000 1.00000000
## 57 0.00000000 1.00000000
## 57.75
## 57.9792 0.00000000 1.00000000
## 59.4 0.00000000 1.00000000
## 60
## 61.175 1.00000000 0.00000000
## 61.3792 1.00000000 0.00000000
## 61.9792 1.00000000 0.00000000
## 63.3583 0.00000000 1.00000000
## 65 0.00000000 1.00000000
## 66.6 0.50000000 0.50000000
## 69.3 0.00000000 1.00000000
## 69.55 1.00000000 0.00000000
## 71 0.50000000 0.50000000
## 71.2833 0.00000000 1.00000000
## 73.5 1.00000000 0.00000000
## 75.2417
## 75.25 0.00000000 1.00000000
## 76.2917 0.00000000 1.00000000
## 76.7292 0.00000000 1.00000000
## 77.2875 1.00000000 0.00000000
## 77.9583 0.00000000 1.00000000
## 78.2667 0.00000000 1.00000000
## 78.85 0.50000000 0.50000000
## 79.2 0.50000000 0.50000000
## 79.65 0.33333333 0.66666667
## 80 0.00000000 1.00000000
## 81.8583 0.00000000 1.00000000
## 82.1708 0.50000000 0.50000000
## 82.2667
## 83.1583 0.00000000 1.00000000
## 83.475 0.50000000 0.50000000
## 86.5 0.00000000 1.00000000
## 89.1042 0.00000000 1.00000000
## 90 0.25000000 0.75000000
## 91.0792 0.00000000 1.00000000
## 93.5 0.00000000 1.00000000
## 106.425 0.50000000 0.50000000
## 108.9 0.50000000 0.50000000
## 110.8833 0.25000000 0.75000000
## 113.275 0.33333333 0.66666667
## 120 0.00000000 1.00000000
## 133.65 0.00000000 1.00000000
## 134.5 0.00000000 1.00000000
## 135.6333 0.33333333 0.66666667
## 136.7792
## 146.5208 0.00000000 1.00000000
## 151.55 0.50000000 0.50000000
## 153.4625 0.33333333 0.66666667
## 164.8667 0.00000000 1.00000000
## 211.3375 0.00000000 1.00000000
## 211.5 1.00000000 0.00000000
## 221.7792 1.00000000 0.00000000
## 227.525 0.25000000 0.75000000
## 247.5208 0.50000000 0.50000000
## 262.375 0.00000000 1.00000000
## 263 0.50000000 0.50000000
## 512.3292 0.00000000 1.00000000
##
## $Embarked
##
## x 0 1
## 0.0000000 1.0000000
## C 0.4464286 0.5535714
## Q 0.6103896 0.3896104
## S 0.6630435 0.3369565
##
## $Deck
##
## x 0 1
## 0.7001456 0.2998544
## A 0.5333333 0.4666667
## B 0.2553191 0.7446809
## C 0.4067797 0.5932203
## D 0.2424242 0.7575758
## E 0.2500000 0.7500000
## F 0.3846154 0.6153846
## G 0.5000000 0.5000000
## T 1.0000000 0.0000000
##
## $Location
##
## x 0 1
## The Bow 0.3076923 0.6923077
## The Middle 0.3052632 0.6947368
## The Rear 0.3673469 0.6326531
##
## $Age_cat
##
## x 0 1
## Adult 0.6373762 0.3626238
## Child 0.4096386 0.5903614
##
## $family_size
##
## x 0 1
## 1 0.6964618 0.3035382
## 2 0.4472050 0.5527950
## 3 0.4215686 0.5784314
## 4 0.2758621 0.7241379
## 5 0.8000000 0.2000000
## 6 0.8636364 0.1363636
## 7 0.6666667 0.3333333
## 8 1.0000000 0.0000000
## 11 1.0000000 0.0000000
Sex and infancy seem to be the crucial predictors.
Looks promising
Finally, let’s split the data set again and standardize the numeric variables
# Split the combined frame back into labelled (train) and unlabelled (test)
# rows. Splitting on whether Survived is known avoids hard-coding row numbers.
has_label <- !is.na(titanic$Survived)
train <- as.data.frame(titanic[has_label, ])
test <- as.data.frame(titanic[!has_label, ])
# Survived is left as NA in `test`: overwriting it with "" would turn the
# factor into a character column and make the two frames inconsistent.

# Learn centering/scaling parameters from the training rows only, then apply
# the same transformation to both sets so that predictions on the test set use
# features on the scale the models were trained on.
titanic_pp <- preProcess(train, method = c("center", "scale"))
titanic_modelling <- predict(titanic_pp, newdata = train)
test_modelling <- predict(titanic_pp, newdata = test)
#attach(titanic_modelling)
Let’s start with logistic regression model
# Baseline logistic regression on the standardized training data using every
# engineered feature. PassengerId is excluded from the formula: it is a row
# identifier, not a passenger attribute, and should not act as a predictor.
# NOTE(review): glm() drops rows with an NA in any predictor by default, and
# Location is NA for most passengers, so the fit uses only a small subset of
# the training rows — confirm that this is intended.
logistic.regression.model <- glm(
  Survived ~ . - PassengerId,
  family = binomial(link = "logit"),
  data = titanic_modelling
)
Let’s test the model
# Sequential analysis-of-deviance table with chi-squared tests for each term.
anova(logistic.regression.model, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 195 246.15
## PassengerId 1 2.692 194 243.46 0.100840
## Pclass 1 0.338 193 243.12 0.560997
## Sex 1 62.216 192 180.91 3.078e-15 ***
## Age 1 7.214 191 173.69 0.007232 **
## SibSp 1 0.275 190 173.42 0.599673
## Parch 1 0.990 189 172.43 0.319715
## Fare 1 0.279 188 172.15 0.597119
## Embarked 3 3.328 185 168.82 0.343697
## Deck 6 13.106 179 155.71 0.041388 *
## Location 2 2.397 177 153.32 0.301660
## Age_cat 1 0.655 176 152.66 0.418475
## family_size 4 2.941 172 149.72 0.567809
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Variable importance scores for the full model's coefficients (caret::varImp).
varImp(logistic.regression.model)
## Overall
## PassengerId 0.970350559
## Pclass 0.617616258
## Sexmale 5.661213193
## Age 2.275369797
## SibSp 0.485112383
## Parch 1.642091028
## Fare 0.812727039
## EmbarkedC 0.007803637
## EmbarkedQ 0.009226067
## EmbarkedS 0.008288520
## DeckB 0.063501720
## DeckC 0.603912015
## DeckD 0.499807776
## DeckE 0.953100938
## DeckF 0.446196542
## DeckG 2.160011337
## LocationThe Middle 0.307049564
## LocationThe Rear 1.646764750
## Age_catChild 0.651022534
## family_size2 0.501579984
## family_size3 1.594096181
## family_size4 0.582385628
## family_size5 0.008161283
Several variables have very low deviance - let’s get rid of them: SibSp, Fare, Embarked and Parch. I assume there might be a multicollinearity problem between Age and Age_cat, so let’s get rid of Age too.
# Reduced modelling frame: drop the weak predictors and Age (kept only through
# Age_cat). PassengerId is dropped as well, because a row identifier should not
# be offered to any model as a predictor.
tm <- titanic_modelling %>%
  select(-one_of(c("PassengerId", "SibSp", "Fare", "Embarked", "Age", "Parch")))
And try one more time
# Refit the logistic regression on the reduced predictor set in `tm`, then
# print its sequential analysis-of-deviance table.
logistic.regression.model.smaller <- glm(
  formula = Survived ~ .,
  data = tm,
  family = binomial(link = "logit")
)
anova(logistic.regression.model.smaller, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 195 246.15
## PassengerId 1 2.692 194 243.46 0.10084
## Pclass 1 0.338 193 243.12 0.56100
## Sex 1 62.216 192 180.91 3.078e-15 ***
## Deck 6 10.061 186 170.85 0.12210
## Location 2 1.864 184 168.98 0.39367
## Age_cat 1 4.359 183 164.62 0.03681 *
## family_size 5 2.267 178 162.35 0.81104
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Variable importance scores for the reduced model's coefficients (caret::varImp).
varImp(logistic.regression.model.smaller)
## Overall
## PassengerId 1.46723017
## Pclass 0.39644876
## Sexmale 5.98112715
## DeckB 0.19257967
## DeckC 0.49529588
## DeckD 0.38152506
## DeckE 0.58664719
## DeckF 0.44579884
## DeckG 2.16893780
## LocationThe Middle 0.50937648
## LocationThe Rear 1.45294779
## Age_catChild 1.69465374
## family_size2 0.96109478
## family_size3 0.88469921
## family_size4 0.33283567
## family_size5 0.01266958
## family_size6 0.39541062
Cross-validation
# Resampling scheme for caret::train(): 5-fold cross-validation, repeated 5 times.
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 5)
This part used to work before I updated R; now I am even more clueless.
# gradient boosting
#gbmFit1 <- train(Survived ~ ., data = tm,
# method = "gbm",
# trControl = fitControl,
# verbose = FALSE)
# random forest
#rfFit1 <- train(Survived ~ ., data = tm,
# method = "rf",
# trControl = fitControl)
# neural network
#nnFit1 <- train(Survived ~ ., data = tm,
# method = "avNNet", trControl = fitControl, linout = TRUE)
#summary(gbmFit1)
#plot(gbmFit1)
#summary(rfFit1)
#plot(rfFit1)
#summary(nnFit1)
#plot(nnFit1)
#log.predictions <- predict(logistic.regression.model, newdata = test)
#log.predictions.smaller <- predict(logistic.regression.model.smaller, newdata = test)
#gbm.predictions <- predict(gbmFit1, newdata = test)
#rf.predictions <- predict(rfFit1, newdata = test)
#nnet.predictions <- predict(nnFit1, newdata = test)
#predicted values for traindata:
#confusionMatrix(gbm.predictions,test$Survived)
#confusionMatrix(rf.predictions,test$Survived)
#confusionMatrix(nnet.predictions,test$Survived)
I was experiencing problems with the confusion matrix, the ROC curve, etc. I guess I did something wrong, but I was not able to find it.
#solution1 <- data.frame(PassengerID = test$PassengerId, Survived = gbm.predictions)
#solution2 <- data.frame(PassengerID = test$PassengerId, Survived = rf.predictions)
#solution3 <- data.frame(PassengerID = test$PassengerId, Survived = nnet.predictions)
# Write the solution to file
#write.csv(solution1, file = 'gbm_mod_Solution.csv', row.names = F)
#write.csv(solution2, file = 'rf_mod_Solution.csv', row.names = F)
#write.csv(solution3, file = 'nnet_mod_Solution.csv', row.names = F)