Machine learning intro in R: Titanic binary classification

Titanic: binary classification example

Packages

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library (ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
library(titanic)
#library(sparklyr)
#sc <- spark_connect(master = "local")

Data:

test <- titanic_test
train<- titanic_train

head (test)
##   PassengerId Pclass                                         Name    Sex
## 1         892      3                             Kelly, Mr. James   male
## 2         893      3             Wilkes, Mrs. James (Ellen Needs) female
## 3         894      2                    Myles, Mr. Thomas Francis   male
## 4         895      3                             Wirz, Mr. Albert   male
## 5         896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female
## 6         897      3                   Svensson, Mr. Johan Cervin   male
##    Age SibSp Parch  Ticket    Fare Cabin Embarked
## 1 34.5     0     0  330911  7.8292              Q
## 2 47.0     1     0  363272  7.0000              S
## 3 62.0     0     0  240276  9.6875              Q
## 4 27.0     0     0  315154  8.6625              S
## 5 22.0     1     1 3101298 12.2875              S
## 6 14.0     0     0    7538  9.2250              S
str(train)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
dim(test)
## [1] 418  11
dim(train)
## [1] 891  12

Turn Survived variable from int to factor

train$Survived<- factor(train$Survived)

Create Survived dummy variable in test before combining

test<- mutate(test, Survived = "none")

Create sorting variable dataset before combining (so we can re-sort later)

test  <- mutate(test,  dataset = "testset")
train <- mutate(train, dataset = "trainset")

Combine datasets for feature engineering

titanic.combined <- rbind(test, train)
str(titanic.combined)
## 'data.frame':    1309 obs. of  13 variables:
##  $ PassengerId: int  892 893 894 895 896 897 898 899 900 901 ...
##  $ Pclass     : int  3 3 2 3 3 3 3 2 3 3 ...
##  $ Name       : chr  "Kelly, Mr. James" "Wilkes, Mrs. James (Ellen Needs)" "Myles, Mr. Thomas Francis" "Wirz, Mr. Albert" ...
##  $ Sex        : chr  "male" "female" "male" "male" ...
##  $ Age        : num  34.5 47 62 27 22 14 30 26 18 21 ...
##  $ SibSp      : int  0 1 0 0 1 0 0 1 0 2 ...
##  $ Parch      : int  0 0 0 0 1 0 0 1 0 0 ...
##  $ Ticket     : chr  "330911" "363272" "240276" "315154" ...
##  $ Fare       : num  7.83 7 9.69 8.66 12.29 ...
##  $ Cabin      : chr  "" "" "" "" ...
##  $ Embarked   : chr  "Q" "S" "Q" "S" ...
##  $ Survived   : chr  "none" "none" "none" "none" ...
##  $ dataset    : chr  "testset" "testset" "testset" "testset" ...

Make local df for simplicity

data<- tbl_df (titanic.combined)

Factorize vars Pclass, dataset, Survived

data$Pclass <- factor(data$Pclass)
data$dataset <- factor(data$dataset)
data$Survived<- factor(data$Survived)

Check for duplicates

IDdups <- distinct(data, PassengerId)
dim(IDdups)
## [1] 1309    1

PassengerId values are all distinct.

Namedups <- distinct(data, Name)
dim(Namedups)
## [1] 1307    1

Two names appear twice, which is plausible for different people, and all PassengerId values are distinct. Let's investigate:

filter(data, duplicated(Name)) 
## Warning: package 'bindrcpp' was built under R version 3.2.5

## # A tibble: 2 x 13
##   PassengerId Pclass                 Name    Sex   Age SibSp Parch Ticket
##         <int> <fctr>                <chr>  <chr> <dbl> <int> <int>  <chr>
## 1         290      3 Connolly, Miss. Kate female    22     0     0 370373
## 2         697      3     Kelly, Mr. James   male    44     0     0 363592
## # ... with 5 more variables: Fare <dbl>, Cabin <chr>, Embarked <chr>,
## #   Survived <fctr>, dataset <fctr>
# List every passenger whose name contains one of the duplicated surnames.
# `Age` used to be passed as grepl()'s third positional argument, which is
# `ignore.case`, not a column to search; only `Name` is matched here.
filter(data, grepl("Kelly|Connolly", Name))
## # A tibble: 7 x 13
##   PassengerId Pclass                                         Name    Sex
##         <int> <fctr>                                        <chr>  <chr>
## 1         892      3                             Kelly, Mr. James   male
## 2         898      3                         Connolly, Miss. Kate female
## 3         290      3                         Connolly, Miss. Kate female
## 4         301      3 "Kelly, Miss. Anna Katherine \"Annie Kate\"" female
## 5         574      3                            Kelly, Miss. Mary female
## 6         697      3                             Kelly, Mr. James   male
## 7         707      2            "Kelly, Mrs. Florence \"Fannie\"" female
## # ... with 9 more variables: Age <dbl>, SibSp <int>, Parch <int>,
## #   Ticket <chr>, Fare <dbl>, Cabin <chr>, Embarked <chr>,
## #   Survived <fctr>, dataset <fctr>

Check for missing values:

# Count the NA values in each column. vapply() fixes the result type to one
# integer per column, whereas sapply() changes shape with its input.
vapply(data, function(x) sum(is.na(x)), integer(1))
## PassengerId      Pclass        Name         Sex         Age       SibSp 
##           0           0           0           0         263           0 
##       Parch      Ticket        Fare       Cabin    Embarked    Survived 
##           0           0           1           0           0           0 
##     dataset 
##           0

Age is missing for 263 passengers and Fare for 1. Consider dropping these rows or imputing the missing values.

Drop na:

# data <- na.omit(data)

Finally, let's look at some descriptive statistics:

summary(tbl_df(data))
##   PassengerId   Pclass      Name               Sex           
##  Min.   :   1   1:323   Length:1309        Length:1309       
##  1st Qu.: 328   2:277   Class :character   Class :character  
##  Median : 655   3:709   Mode  :character   Mode  :character  
##  Mean   : 655                                                
##  3rd Qu.: 982                                                
##  Max.   :1309                                                
##                                                              
##       Age            SibSp            Parch          Ticket         
##  Min.   : 0.17   Min.   :0.0000   Min.   :0.000   Length:1309       
##  1st Qu.:21.00   1st Qu.:0.0000   1st Qu.:0.000   Class :character  
##  Median :28.00   Median :0.0000   Median :0.000   Mode  :character  
##  Mean   :29.88   Mean   :0.4989   Mean   :0.385                     
##  3rd Qu.:39.00   3rd Qu.:1.0000   3rd Qu.:0.000                     
##  Max.   :80.00   Max.   :8.0000   Max.   :9.000                     
##  NA's   :263                                                        
##       Fare            Cabin             Embarked         Survived  
##  Min.   :  0.000   Length:1309        Length:1309        0   :549  
##  1st Qu.:  7.896   Class :character   Class :character   1   :342  
##  Median : 14.454   Mode  :character   Mode  :character   none:418  
##  Mean   : 33.295                                                   
##  3rd Qu.: 31.275                                                   
##  Max.   :512.329                                                   
##  NA's   :1                                                         
##      dataset   
##  testset :418  
##  trainset:891  
##                
##                
##                
##                
## 
head(data)
## # A tibble: 6 x 13
##   PassengerId Pclass                                         Name    Sex
##         <int> <fctr>                                        <chr>  <chr>
## 1         892      3                             Kelly, Mr. James   male
## 2         893      3             Wilkes, Mrs. James (Ellen Needs) female
## 3         894      2                    Myles, Mr. Thomas Francis   male
## 4         895      3                             Wirz, Mr. Albert   male
## 5         896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female
## 6         897      3                   Svensson, Mr. Johan Cervin   male
## # ... with 9 more variables: Age <dbl>, SibSp <int>, Parch <int>,
## #   Ticket <chr>, Fare <dbl>, Cabin <chr>, Embarked <chr>,
## #   Survived <fctr>, dataset <fctr>

Viz for analysis

Take slice of data to investigate distributional properties

# Keep the complete-case rows of the training set. Rows are selected by the
# `dataset` label rather than by position: na.omit() removes rows first, so
# positions 419:1309 no longer line up with the training set and silently
# drop its first rows.
# NOTE: the output recorded below came from the old positional slice;
# re-knit to refresh it.
trainset <- data %>%
  na.omit() %>%
  filter(dataset == "trainset")
head (trainset)
## # A tibble: 6 x 13
##   PassengerId Pclass                            Name    Sex   Age SibSp
##         <int> <fctr>                           <chr>  <chr> <dbl> <int>
## 1         113      3          Barton, Mr. David John   male  22.0     0
## 2         114      3         Jussila, Miss. Katriina female  20.0     1
## 3         115      3           Attalah, Miss. Malake female  17.0     0
## 4         116      3           Pekoniemi, Mr. Edvard   male  21.0     0
## 5         117      3            Connors, Mr. Patrick   male  70.5     0
## 6         118      2 Turpin, Mr. William John Robert   male  29.0     1
## # ... with 7 more variables: Parch <int>, Ticket <chr>, Fare <dbl>,
## #   Cabin <chr>, Embarked <chr>, Survived <fctr>, dataset <fctr>
glimpse(trainset);
## Observations: 627
## Variables: 13
## $ PassengerId <int> 113, 114, 115, 116, 117, 118, 119, 120, 121, 123, ...
## $ Pclass      <fctr> 3, 3, 3, 3, 3, 2, 1, 3, 2, 2, 2, 1, 3, 3, 3, 3, 3...
## $ Name        <chr> "Barton, Mr. David John", "Jussila, Miss. Katriina...
## $ Sex         <chr> "male", "female", "female", "male", "male", "male"...
## $ Age         <dbl> 22.0, 20.0, 17.0, 21.0, 70.5, 29.0, 24.0, 2.0, 21....
## $ SibSp       <int> 0, 1, 0, 0, 0, 1, 0, 4, 2, 1, 0, 0, 1, 0, 0, 0, 0,...
## $ Parch       <int> 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ Ticket      <chr> "324669", "4136", "2627", "STON/O 2. 3101294", "37...
## $ Fare        <dbl> 8.0500, 9.8250, 14.4583, 7.9250, 7.7500, 21.0000, ...
## $ Cabin       <chr> "", "", "", "", "", "", "B58 B60", "", "", "", "E1...
## $ Embarked    <chr> "S", "S", "C", "S", "Q", "S", "C", "S", "S", "C", ...
## $ Survived    <fctr> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0...
## $ dataset     <fctr> trainset, trainset, trainset, trainset, trainset,...

Plotting helper function:

# MULTIPLOT  -------------------------------------------
# Draw several plots on one page, arranged on a grid layout.
#
# Arguments:
#   ...       Plot objects to draw. When more than one is given, each is
#             printed with a `vp` argument, so its print method must accept
#             one (ggplot objects do).
#   plotlist  Optional list of further plot objects, appended after `...`.
#   file      Unused; kept so existing calls keep working.
#   cols      Number of columns in the automatically generated layout.
#   layout    Optional matrix; the cells equal to i give the position of the
#             i-th plot. When supplied, `cols` is ignored.
#
# Returns NULL invisibly; called for its side effect of drawing.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  plots <- c(list(...), plotlist)
  num_plots <- length(plots)

  # Nothing to draw. Without this guard `1:0` iterates over c(1, 0) and the
  # loop fails on plots[[1]].
  if (num_plots == 0) {
    return(invisible(NULL))
  }

  if (is.null(layout)) {
    # Fill column-wise: plot i goes to the i-th cell of the matrix.
    num_rows <- ceiling(num_plots / cols)
    layout <- matrix(seq_len(cols * num_rows), ncol = cols, nrow = num_rows)
  }

  if (num_plots == 1) {
    print(plots[[1]])
    return(invisible(NULL))
  }

  # grid functions are namespace-qualified so the function does not attach
  # a package to the caller's search path.
  grid::grid.newpage()
  grid::pushViewport(
    grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout)))
  )

  for (i in seq_len(num_plots)) {
    # Row/column coordinates of every layout cell assigned to plot i.
    cell <- as.data.frame(which(layout == i, arr.ind = TRUE))

    print(plots[[i]], vp = grid::viewport(layout.pos.row = cell$row,
                                          layout.pos.col = cell$col))
  }

  invisible(NULL)
}

# ---------------------------------------------------
# BAR PLOT -------------------------------------------
# Draw two bar charts of one variable split by survival: stacked counts and
# within-bar proportions, laid out on one page with multiplot().
#
# Arguments:
#   df_arg   Data frame with a `Survived` column.
#   var_arg  Vector with one value per row of `df_arg` (e.g. df_arg$Age);
#            each distinct value becomes one bar.
#   colNum   Number of layout columns handed to multiplot().
plot_ggbar <- function(df_arg, var_arg, colNum) {
  # Capture the caller's expression (e.g. `trainset$Age`) for the axis label.
  x_label <- deparse(substitute(var_arg))[1]

  if (length(var_arg) != nrow(df_arg)) {
    stop("`var_arg` must have one value per row of `df_arg`.", call. = FALSE)
  }

  # Drop the same rows from the data and from the plotted vectors. Mapping
  # the unfiltered vectors against na.omit(df_arg) fails with a length
  # mismatch as soon as df_arg contains a row with an NA.
  keep <- stats::complete.cases(df_arg)
  plot_df <- data.frame(
    value = factor(var_arg[keep]),
    survived = factor(df_arg$Survived[keep])
  )

  base_plot <- ggplot(plot_df, aes(value, fill = survived)) +
    labs(x = x_label, fill = "Survived")

  # Stacked counts (geom_bar's default position).
  p1 <- base_plot +
    geom_bar(width = .95)

  # Bars rescaled to proportions within each value.
  p2 <- base_plot +
    geom_bar(position = "fill", width = .95)

  multiplot(p1, p2, cols = colNum)
}
# -------------------------------------------

AGE

plot_ggbar(trainset, trainset$Age, 1)
## Loading required package: grid

SEX

plot_ggbar(trainset, trainset$Sex, 2)

Pclass

plot_ggbar(trainset, trainset$Pclass, 2)

Fare

plot_ggbar(trainset, trainset$Fare, 1)

#library(e1071)