1 Loading the data
2 Combining train and test
- 2.1 Convert variable to factors
- 2.2 Exploratory Data
3 Variable explanation
4 Remove “unuseful” variable
5 Missing Value Imputation

Titanic competition from Kaggle, Part 1. Missing data are imputed using three approaches:

kNN imputation via the preProcess function in the caret package
median imputation via the preProcess function in the caret package
the mice package

library(tidyverse)
library(DataExplorer)
library(lubridate)
library(pander)
library(data.table)
library(grid)
library(gridExtra)
library(mice)
library(caret)

1 Loading the data

# Read the Kaggle training set; the first row of the file supplies the column names.
titanic.train <- read_csv(file = "train.csv", col_names = TRUE)

# Read the Kaggle test set the same way.
titanic.test <- read_csv(file = "test.csv", col_names = TRUE)

Add a Survived column (filled with NA) to the test dataset so that its columns match those of the train dataset.

# Add an all-NA Survived column to the test set so it has the same columns as the train set.
titanic.test[["Survived"]] <- NA

2 Combining train and test

Combine the train and test datasets so that both are cleaned together.

# Stack the test rows on top of the train rows (test first, then train).
# base::rbind() is called explicitly because the mice package attached above
# masks rbind() with a wrapper that runs
# `if (attr(list(...)[[1]], "class") == "mids")`. A tibble's class vector has
# more than one element, so that check warned "the condition has length > 1"
# and is an error on R >= 4.2. The wrapper fell through to base::rbind()
# anyway, so the combined data is unchanged.
titanic <- base::rbind(titanic.test, titanic.train)

# Preview the first six rows of the combined data.
head(titanic, n = 6)

# List the column names of the combined data.
names(titanic)

##  [1] "PassengerId" "Pclass"      "Name"        "Sex"         "Age"        
##  [6] "SibSp"       "Parch"       "Ticket"      "Fare"        "Cabin"      
## [11] "Embarked"    "Survived"

2.1 Convert variable to factors

Use the map function from the purrr package to convert multiple variables into factors at once.

# Columns that are converted to factors.
factor_variable <- c("Survived", "Pclass", "Sex", "Embarked")

# Convert each listed column to a factor and write the results back in place.
titanic[factor_variable] <- lapply(titanic[factor_variable], as.factor)

# Inspect the structure of the combined data to confirm the factor conversion.
str(titanic)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1309 obs. of  12 variables:
##  $ PassengerId: int  892 893 894 895 896 897 898 899 900 901 ...
##  $ Pclass     : Factor w/ 3 levels "1","2","3": 3 3 2 3 3 3 3 2 3 3 ...
##  $ Name       : chr  "Kelly, Mr. James" "Wilkes, Mrs. James (Ellen Needs)" "Myles, Mr. Thomas Francis" "Wirz, Mr. Albert" ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 2 2 1 2 1 2 1 2 ...
##  $ Age        : num  34.5 47 62 27 22 14 30 26 18 21 ...
##  $ SibSp      : int  0 1 0 0 1 0 0 1 0 2 ...
##  $ Parch      : int  0 0 0 0 1 0 0 1 0 0 ...
##  $ Ticket     : chr  "330911" "363272" "240276" "315154" ...
##  $ Fare       : num  7.83 7 9.69 8.66 12.29 ...
##  $ Cabin      : chr  NA NA NA NA ...
##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 2 3 2 3 3 3 2 3 1 3 ...
##  $ Survived   : Factor w/ 2 levels "0","1": NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 11
##   .. ..$ PassengerId: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Pclass     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Name       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Sex        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Age        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ SibSp      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Parch      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Ticket     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Fare       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Cabin      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Embarked   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

2.2 Exploratory Data

Check for missing values in every column of the data frame.

The method is from https://sebastiansauer.github.io/sum-isna/

# Count the missing values in each column; returns a named list with one count per column.
map(titanic, function(column) sum(is.na(column)))

## $PassengerId
## [1] 0
## 
## $Pclass
## [1] 0
## 
## $Name
## [1] 0
## 
## $Sex
## [1] 0
## 
## $Age
## [1] 263
## 
## $SibSp
## [1] 0
## 
## $Parch
## [1] 0
## 
## $Ticket
## [1] 0
## 
## $Fare
## [1] 1
## 
## $Cabin
## [1] 1014
## 
## $Embarked
## [1] 2
## 
## $Survived
## [1] 418

3 Variable explanation

| Variable Name | Description |
|---|---|
| Survived | Survived (1) or died (0) |
| Pclass | Passenger’s class |
| Name | Passenger’s name |
| Sex | Passenger’s sex |
| Age | Passenger’s age |
| SibSp | Number of siblings/spouses aboard |
| Parch | Number of parents/children aboard |
| Ticket | Ticket number |
| Fare | Fare |
| Cabin | Cabin |
| Embarked | Port of embarkation |

4 Remove “unuseful” variable

The following variables might not be relevant for survival, so they are removed for this attempt:

Name, Ticket, Fare, Cabin, Embarked.

Fare and Cabin might be correlated with Pclass.

titanic.2 is the dataset without the Name, Ticket, Fare, Cabin and Embarked variables.

# Drop the columns judged not useful for this attempt; all other columns are kept.
titanic.2 <- select(titanic, -Name, -Ticket, -Fare, -Cabin, -Embarked)

# Confirm which columns remain.
names(titanic.2)

## [1] "PassengerId" "Pclass"      "Sex"         "Age"         "SibSp"      
## [6] "Parch"       "Survived"

# Count the missing values in each remaining column.
map(titanic.2, function(column) sum(is.na(column)))

## $PassengerId
## [1] 0
## 
## $Pclass
## [1] 0
## 
## $Sex
## [1] 0
## 
## $Age
## [1] 263
## 
## $SibSp
## [1] 0
## 
## $Parch
## [1] 0
## 
## $Survived
## [1] 418

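Now Age is the only predictor with missing values. The 418 missing Survived values are the test-set rows, where Survived was set to NA above.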

5 Missing Value Imputation

Age distribution

# Histogram of the observed Age values, before any imputation.
plot_histogram(titanic.2$Age)

5.1 Knn - preProcess Caret Package

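The knnImpute method of the preProcess function in the caret package also centers and scales the numeric variables, so the data come back transformed (standardized) rather than on their original scale.

titanic.kNN is the dataset processed by the preProcess function in caret.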

# Column names, for reference when excluding the identifier and outcome below.
names(titanic.2)

## [1] "PassengerId" "Pclass"      "Sex"         "Age"         "SibSp"      
## [6] "Parch"       "Survived"

# Fit the kNN imputation on every column except PassengerId and Survived
# (columns 1 and 7 in the listing above).
AgePredict <- preProcess(
  titanic.2[, !names(titanic.2) %in% c("PassengerId", "Survived")],
  method = "knnImpute"
)

# Apply the fitted pre-processing to the full dataset.
titanic.kNN <- predict(AgePredict, newdata = titanic.2)

# Summarise Age after kNN imputation.
summary(titanic.kNN$Age)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -2.061342 -0.546789  0.077626 -0.000672  0.355144  3.477218

After using knnImpute in the preProcess function in caret, the Age data is transformed: it is now centered near 0 instead of being measured in years.

# Histogram of Age after kNN imputation; the values are on the transformed scale returned by preProcess.
plot_histogram(titanic.kNN$Age)

5.2 Median Impute Caret

titanic.median is the dataset with Age imputed by the median, using caret.

# Fit median imputation on every column except PassengerId and Survived.
AgePredict <- preProcess(
  titanic.2[, !names(titanic.2) %in% c("PassengerId", "Survived")],
  method = "medianImpute"
)

# Apply the fitted imputation to the full dataset.
titanic.median <- predict(AgePredict, newdata = titanic.2)

# Histogram of Age after median imputation.
plot_histogram(titanic.median$Age)

Check the difference between the original data (titanic.2) and the median-imputed data (titanic.median).

# Age after median imputation.
summary(titanic.median$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.17   22.00   28.00   29.50   35.00   80.00

# Age in the original data (still containing NA's), for comparison.
summary(titanic.2$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.17   21.00   28.00   29.88   39.00   80.00     263

5.3 MICE package for imputation

For an introduction to mice, see https://www.rdocumentation.org/packages/mice/versions/3.0.0/topics/mice and https://datascienceplus.com/imputing-missing-data-with-r-mice-package/

# Column names, for reference when excluding columns from the mice call below.
colnames(titanic.2)

## [1] "PassengerId" "Pclass"      "Sex"         "Age"         "SibSp"      
## [6] "Parch"       "Survived"

Use the mice package to impute the missing values in the Age variable; method = "rf" selects random-forest imputation.

mice imputation has two steps.

1st step: the mice function. 2nd step: the complete function.

# Impute the missing Age values with mice using random-forest imputation
# (method = "rf"). PassengerId and Survived are excluded from the data passed
# to mice, so only Pclass, Sex, Age, SibSp and Parch take part.
# The imputation is stochastic; a fixed seed makes the imputed values
# reproducible from run to run.
miceimpute <- mice(
  titanic.2[, !names(titanic.2) %in% c("PassengerId", "Survived")],
  method = "rf",
  seed = 2018
)

## 
##  iter imp variable
##   1   1  Age
##   1   2  Age
##   1   3  Age
##   1   4  Age
##   1   5  Age
##   2   1  Age
##   2   2  Age
##   2   3  Age
##   2   4  Age
##   2   5  Age
##   3   1  Age
##   3   2  Age
##   3   3  Age
##   3   4  Age
##   3   5  Age
##   4   1  Age
##   4   2  Age
##   4   3  Age
##   4   4  Age
##   4   5  Age
##   5   1  Age
##   5   2  Age
##   5   3  Age
##   5   4  Age
##   5   5  Age

# Extract a completed dataset from the mids object (complete()'s default
# action returns the first imputed dataset).
# The call is namespace-qualified because tidyr, attached via tidyverse, also
# provides a complete(); the unqualified name therefore depends on package
# attach order and on the installed mice version.
micecomplete <- mice::complete(miceimpute)

The micecomplete dataset has only 5 variables, because PassengerId and Survived were excluded from the data passed to mice.

# Inspect the structure of the completed dataset returned by mice.
str(micecomplete)

## 'data.frame':    1309 obs. of  5 variables:
##  $ Pclass: Factor w/ 3 levels "1","2","3": 3 3 2 3 3 3 3 2 3 3 ...
##  $ Sex   : Factor w/ 2 levels "female","male": 2 1 2 2 1 2 1 2 1 2 ...
##  $ Age   : num  34.5 47 62 27 22 14 30 26 18 21 ...
##  $ SibSp : int  0 1 0 0 1 0 0 1 0 2 ...
##  $ Parch : int  0 0 0 0 1 0 0 1 0 0 ...

Compare the imputed data with the original data.

# Age after mice imputation.
summary(micecomplete$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.17   21.00   28.00   29.45   37.00   80.00

# Age in the original data (still containing NA's), for comparison.
summary(titanic.2$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.17   21.00   28.00   29.88   39.00   80.00     263

Copy the imputed Age variable from micecomplete into titanic.mice, which starts as a copy of titanic.2. The titanic.mice dataset therefore keeps all 7 variables.

# Start from titanic.2 (all 7 columns) and overwrite Age with the mice-imputed values.
titanic.mice <- titanic.2
titanic.mice[["Age"]] <- micecomplete[["Age"]]

# Verify that no missing Age values remain.
anyNA(titanic.mice[["Age"]])

## [1] FALSE

# Histogram of Age after mice imputation.
plot_histogram(titanic.mice$Age)

Export the mice-imputed dataset.

# Write the mice-imputed dataset to titanic_mice.csv in the working directory.
write_csv(titanic.mice,"titanic_mice.csv")

Kaggle-Titanic-Caret-1

Ming Si

June 07 2018