Link to the project in RPubs: http://rpubs.com/ofomicheva86/390683

#required packages
library(knitr)
library(rcompanion)
library(corrplot)
library(PerformanceAnalytics)
library(GGally)
library(RColorBrewer)
library(VIM)
library(dplyr)
library(tidyr)
library(mice)
library(pROC)
library(caret)
library(pscl)
library(ResourceSelection)
library(car)
library(speedglm)
library(gdata)
library(ggplot2)
library(RWeka)

1.DATA EXPLORATION

The dataset contains the variables described below:

  1. ‘Survived’- survival (0 = No, 1 = Yes)
  2. ‘Pclass’ - ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
  3. ‘Sex’ - sex
  4. ‘Age’ - age in years
  5. ‘Sibsp’ - number of siblings / spouses aboard the Titanic
  6. ‘Parch’ - number of parents / children aboard the Titanic
  7. ‘Ticket’ - ticket number
  8. ‘Fare’ - passenger fare
  9. ‘Cabin’ - cabin number
    10.‘Embarked’ - port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

Read training and testing datasets

#read training data set
#replace blanks with NAs
data <- read.csv(file=
"https://raw.githubusercontent.com/olga0503/DATA-621/master/titanic_train.csv",
header=T, na.strings=c("","NA"))

#read testing data set
data_testing <- read.csv(file=
"https://raw.githubusercontent.com/olga0503/DATA-621/master/titanic_test.csv",
header=T, na.strings=c("","NA"))

#display first six entries
head(data)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp
## 1                             Braund, Mr. Owen Harris   male  22     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                              Heikkinen, Miss. Laina female  26     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                            Allen, Mr. William Henry   male  35     0
## 6                                    Moran, Mr. James   male  NA     0
##   Parch           Ticket    Fare Cabin Embarked
## 1     0        A/5 21171  7.2500  <NA>        S
## 2     0         PC 17599 71.2833   C85        C
## 3     0 STON/O2. 3101282  7.9250  <NA>        S
## 4     0           113803 53.1000  C123        S
## 5     0           373450  8.0500  <NA>        S
## 6     0           330877  8.4583  <NA>        Q
#find dimentions
dim(data)
## [1] 891  12
levels(factor(data$Parch))
## [1] "0" "1" "2" "3" "4" "5" "6"

Count missing values.

#chart for missing values
aggr(data[-1], prop = T, numbers = T, cex.axis=.8, 
     ylab=c("Proportion of missingness","Missingness Pattern"),
     labels=names(data[-1]))

#build function that counts missing values
count_nas <- function(data){
  
variable_name_column <- c()
number_missing_column <- c()

for (i in 2:ncol(data)){
  variable_name <- colnames(data[i])
  number_missing <- sum(is.na(data[i]))
  variable_name_column <- c(variable_name_column,variable_name)
  number_missing_column <- c(number_missing_column,number_missing)
}

missing_table <- data.frame(variable_name_column,number_missing_column)
missing_table <- missing_table %>% mutate(percentage=round(number_missing_column*100/nrow(data),2)) %>% arrange(desc(percentage))
missing_table
}

#build function that counts negative values
count_neg <- function(data){
  
variable_name_column <- c()
number_negative_column <- c()  


for (i in 3:ncol(data)){
  neg_count <- 0
  variable_name <- colnames(data[i])
  for (j in 1:nrow(data)){
    if(is.numeric(data[j,i]) && !is.na(data[j,i]) && data[j,i] < 0) {
      neg_count <- neg_count + 1
    }
    }
    number_negative_column <- c(number_negative_column,neg_count)
    variable_name_column  <- c(variable_name_column,variable_name) 
  }


negative_table <- data.frame(variable_name_column,number_negative_column)
negative_table <- negative_table %>% mutate(percentage=round(number_negative_column*100/nrow(data),2)) %>% arrange(desc(percentage))
negative_table
}

#build function that counts 0s
count_zeros <- function(data){
  
variable_name_column <- c()
number_zero_column <- c()  


for (i in 3:ncol(data)){
  zero_count <- 0
  variable_name <- colnames(data[i])
  for (j in 1:nrow(data)){
    if(is.numeric(data[j,i]) && !is.na(data[j,i]) && data[j,i] == 0) {
      zero_count <- zero_count + 1
    }
    }
    number_zero_column <- c(number_zero_column,zero_count)
    variable_name_column  <- c(variable_name_column,variable_name) 
  }


zero_table <- data.frame(variable_name_column,number_zero_column)
zero_table <- zero_table %>% mutate(percentage=round(number_zero_column*100/nrow(data),2)) %>% arrange(desc(percentage))
zero_table
}

#count NAs
count_nas(data)
## Warning: package 'bindrcpp' was built under R version 3.4.4
##    variable_name_column number_missing_column percentage
## 1                 Cabin                   687      77.10
## 2                   Age                   177      19.87
## 3              Embarked                     2       0.22
## 4              Survived                     0       0.00
## 5                Pclass                     0       0.00
## 6                  Name                     0       0.00
## 7                   Sex                     0       0.00
## 8                 SibSp                     0       0.00
## 9                 Parch                     0       0.00
## 10               Ticket                     0       0.00
## 11                 Fare                     0       0.00
  1. DATA PREPARATION

Create additional variables based on existing variables.

#split the variable 'Name' into 'First_Name', 'Last Name' and 'Salutation'
data <- data %>% separate(Name, c("Last_Name", "name"), sep = "\\,\\s+", na.rm=T)  %>% separate(name, c("Salutation", "First_Name"), sep = "\\.\\s+", na.rm=T) 
## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [514].
#split the variable 'Cabin' into 'Floor' and "Room"
data <- data %>% mutate(Cabin2=Cabin)  %>% separate(Cabin, c("Floor", "Cabin"), sep = "[:digit:]", na.rm=T) %>% separate(Cabin2, c("Cabin2", "Room"), sep = "[:alpha:]", na.rm=T) %>% dplyr::select(-Cabin, -Cabin2) 
## Warning: Expected 2 pieces. Additional pieces discarded in 176 rows [2,
## 4, 7, 12, 22, 28, 32, 53, 55, 56, 62, 63, 67, 76, 89, 93, 98, 103, 111,
## 119, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [293,
## 328, 340, 474].
## Warning: Expected 2 pieces. Additional pieces discarded in 24 rows [28, 76,
## 89, 98, 119, 129, 298, 300, 306, 312, 342, 391, 436, 439, 499, 680, 700,
## 701, 716, 743, ...].
head(data)
##   PassengerId Survived Pclass Last_Name Salutation
## 1           1        0      3    Braund         Mr
## 2           2        1      1   Cumings        Mrs
## 3           3        1      3 Heikkinen       Miss
## 4           4        1      1  Futrelle        Mrs
## 5           5        0      3     Allen         Mr
## 6           6        0      3     Moran         Mr
##                              First_Name    Sex Age SibSp Parch
## 1                           Owen Harris   male  22     1     0
## 2 John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                                 Laina female  26     0     0
## 4         Jacques Heath (Lily May Peel) female  35     1     0
## 5                         William Henry   male  35     0     0
## 6                                 James   male  NA     0     0
##             Ticket    Fare Floor Embarked Room
## 1        A/5 21171  7.2500  <NA>        S <NA>
## 2         PC 17599 71.2833     C        C   85
## 3 STON/O2. 3101282  7.9250  <NA>        S <NA>
## 4           113803 53.1000     C        S  123
## 5           373450  8.0500  <NA>        S <NA>
## 6           330877  8.4583  <NA>        Q <NA>
colnames(data)
##  [1] "PassengerId" "Survived"    "Pclass"      "Last_Name"   "Salutation" 
##  [6] "First_Name"  "Sex"         "Age"         "SibSp"       "Parch"      
## [11] "Ticket"      "Fare"        "Floor"       "Embarked"    "Room"

Convert variables to proper formats.

#convert 'Survived',Pclass' and 'Salutation' to factor and 'Cabin section' to integer
data <- data %>% mutate(Survived = as.factor(Survived), 
                        Pclass = as.factor(Pclass),
                        Floor = as.factor(Pclass),
                        Salutation = as.factor(Salutation),
                        Room = as.integer(Room),
                        Ticket = as.character(Ticket))

#for 'Pclass' assign the values  'Upper', 'Middle', 'Lower' to '1','2' and '3' respectevly.
data <- data %>% mutate(Pclass=factor(ifelse(Pclass=="1","Upper",ifelse(Pclass=="2","Middle",ifelse(Pclass=="3","Lower",Pclass)))))

#for 'Embarked' assign the values  'Cherbourg', 'Queenstown', 'Southampton' to 'C','Q' and 'S' respectevly.
data <- data %>% mutate(Embarked=factor(ifelse(Embarked=="C","Cherbourg",ifelse(Embarked=="Q","Queenstown",ifelse(Embarked=="S","Southampton",Embarked))))) 

#display first six records
head(data)
##   PassengerId Survived Pclass Last_Name Salutation
## 1           1        0  Lower    Braund         Mr
## 2           2        1  Upper   Cumings        Mrs
## 3           3        1  Lower Heikkinen       Miss
## 4           4        1  Upper  Futrelle        Mrs
## 5           5        0  Lower     Allen         Mr
## 6           6        0  Lower     Moran         Mr
##                              First_Name    Sex Age SibSp Parch
## 1                           Owen Harris   male  22     1     0
## 2 John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                                 Laina female  26     0     0
## 4         Jacques Heath (Lily May Peel) female  35     1     0
## 5                         William Henry   male  35     0     0
## 6                                 James   male  NA     0     0
##             Ticket    Fare Floor    Embarked Room
## 1        A/5 21171  7.2500     3 Southampton   NA
## 2         PC 17599 71.2833     1   Cherbourg   85
## 3 STON/O2. 3101282  7.9250     3 Southampton   NA
## 4           113803 53.1000     1 Southampton  123
## 5           373450  8.0500     3 Southampton   NA
## 6           330877  8.4583     3  Queenstown   NA

Analyze whether missing values in Cabin are predictive of survival outcome

#for 'Cabin floor' replace NAs with "NONE"
#data <- data %>% mutate(Floor = factor(ifelse(is.na(Floor),"NONE",Floor))) 

#for 'Cabin section' replace NAs with "NONE"
#data <- data %>% mutate(Room = as.integer(ifelse(is.na(Room),"0",Room)))

head(data)
##   PassengerId Survived Pclass Last_Name Salutation
## 1           1        0  Lower    Braund         Mr
## 2           2        1  Upper   Cumings        Mrs
## 3           3        1  Lower Heikkinen       Miss
## 4           4        1  Upper  Futrelle        Mrs
## 5           5        0  Lower     Allen         Mr
## 6           6        0  Lower     Moran         Mr
##                              First_Name    Sex Age SibSp Parch
## 1                           Owen Harris   male  22     1     0
## 2 John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                                 Laina female  26     0     0
## 4         Jacques Heath (Lily May Peel) female  35     1     0
## 5                         William Henry   male  35     0     0
## 6                                 James   male  NA     0     0
##             Ticket    Fare Floor    Embarked Room
## 1        A/5 21171  7.2500     3 Southampton   NA
## 2         PC 17599 71.2833     1   Cherbourg   85
## 3 STON/O2. 3101282  7.9250     3 Southampton   NA
## 4           113803 53.1000     1 Southampton  123
## 5           373450  8.0500     3 Southampton   NA
## 6           330877  8.4583     3  Queenstown   NA
#create mosaic plot for 'Cabin floor'
count <- table(data$Survived, data$Floor)
mosaicplot(count, main = "Distribution of 'Survived'",
           xlab = "Survived",
           ylab = "Cabin",
           las = 1,
           border = "black",
           shade = TRUE
           )

Apply multiple imputation.

#apply multiple imputation for training data set
#exclude variable 'PassengerId' and 'Survived'
exclude <- c('PassengerId')
data_no_missing_variables <- data %>% dplyr::select(-Age,-Embarked,-Floor,-Room)
include <- setdiff(names(data), exclude)
data_include <- data[include]

#imputation with mean
imp.data <- mice(data_include, m=20, method='pmm', printFlag=FALSE)

#merge imputed values with data frame
data <- mice::complete(imp.data)
data <- data.frame(data_no_missing_variables,data %>% dplyr::select(Age,Embarked,Floor,Room))
head(data)
##   PassengerId Survived Pclass Last_Name Salutation
## 1           1        0  Lower    Braund         Mr
## 2           2        1  Upper   Cumings        Mrs
## 3           3        1  Lower Heikkinen       Miss
## 4           4        1  Upper  Futrelle        Mrs
## 5           5        0  Lower     Allen         Mr
## 6           6        0  Lower     Moran         Mr
##                              First_Name    Sex SibSp Parch
## 1                           Owen Harris   male     1     0
## 2 John Bradley (Florence Briggs Thayer) female     1     0
## 3                                 Laina female     0     0
## 4         Jacques Heath (Lily May Peel) female     1     0
## 5                         William Henry   male     0     0
## 6                                 James   male     0     0
##             Ticket    Fare Age    Embarked Floor Room
## 1        A/5 21171  7.2500  22 Southampton     3   15
## 2         PC 17599 71.2833  38   Cherbourg     1   85
## 3 STON/O2. 3101282  7.9250  26 Southampton     3   35
## 4           113803 53.1000  35 Southampton     1  123
## 5           373450  8.0500  35 Southampton     3  124
## 6           330877  8.4583  36  Queenstown     3  101
#confirm no NAs
count_nas(data)
##    variable_name_column number_missing_column percentage
## 1              Survived                     0          0
## 2                Pclass                     0          0
## 3             Last_Name                     0          0
## 4            Salutation                     0          0
## 5            First_Name                     0          0
## 6                   Sex                     0          0
## 7                 SibSp                     0          0
## 8                 Parch                     0          0
## 9                Ticket                     0          0
## 10                 Fare                     0          0
## 11                  Age                     0          0
## 12             Embarked                     0          0
## 13                Floor                     0          0
## 14                 Room                     0          0
#write.csv(data, file = "/Users/olga/downloads/data_imputed.csv", row.names = FALSE)
data <- data %>% mutate(with_relatives=factor(ifelse(SibSp+Parch == 0, "Yes","No")))

Variables Analysis:

  1. Variable ‘Survived’.

It’s response variable that defines chance of survival. It belongs to binary type as it can take only “Yes” or “No”.

  1. Variable ‘Pclass’.
#mosaic plot
count <- table(data$Pclass, data$Survived)
mosaicplot(count, main = "Pclass vs Survived",
           xlab = "Pclass",
           ylab = "Survived",
           las = 1,
           border = "black",
           shade = TRUE
           )

  1. Variable ‘Sex’.
#mosaic plot
count <- table(data$Sex, data$Survived)
mosaicplot(count, main = "Sex vs Survived",
           xlab = "Sex",
           ylab = "Survived",
           las = 1,
           border = "black",
           shade = TRUE
           )

  1. Variable ‘Age’.

It defines age in years.

boxplot(data$Age)

par(mfrow=c(1,2))
plotNormalHistogram(data$Age,main="Age")
qqnorm(data$Age)
qqline(data$Age)

  1. ‘SibSp’ - number of siblings / spouses aboard the Titanic
boxplot(data$SibSp)

par(mfrow=c(1,2))
plotNormalHistogram(data$SibSp,main="SibSp")
qqnorm(data$SibSp)
qqline(data$SibSp)

# Change histogram plot fill colors by groups
ggplot(data, aes(x=Fare, fill=Survived, color=Survived)) +
  geom_histogram(position="identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  1. ‘Parch’ - number of parents / children aboard the Titanic
boxplot(data$Parch)

par(mfrow=c(1,2))
plotNormalHistogram(data$Parch,main="Parch")
qqnorm(data$Parch)
qqline(data$Parch)

  1. ‘Ticket’ - ticket number

  2. ‘Fare’ - passenger fare

boxplot(data$Fare)

par(mfrow=c(1,2))
plotNormalHistogram(data$Fare,main="Fare")
qqnorm(data$Fare)
qqline(data$Fare)

  1. ‘Cabin’ - cabin number

The variable cabin was split by two other variables - ‘Flooor’ and ‘Room’.

#floor
count <- table(data$Floor, data$Survived)
mosaicplot(count, main = "Floor vs Survived",
           xlab = "Floor",
           ylab = "Survived",
           las = 1,
           border = "black",
           shade = TRUE
           )

#room
boxplot(data$Room)

par(mfrow=c(1,2))
plotNormalHistogram(data$Room,main="Room")
qqnorm(data$Room)
qqline(data$Room)

  1. ‘Embarked’ - port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
#mosaic plot
count <- table(data$Embarked, data$Survived)
mosaicplot(count, main = "Embarked vs Survived",
           xlab = "Embarked",
           ylab = "Survived",
           las = 1,
           border = "black",
           shade = TRUE
           )

  1. BUILD MODELS

The assumptions for the logistic regression are:

Assumption 1 - Logistic regression typically requires a large sample size. It requires a minimum of 10 cases with the least frequent outcome for each independent variable in the model. For example, if a model has 5 independent variables and the expected probability of the least frequent outcome is .10, then the minimum sample size is 500 (10*5 / .10).

dim(data)
## [1] 891  16

The data set that contains 891 observations can be considered as a large data set.

First, binary logistic regression requires the dependent variable to be binary.

levels(factor(data$Survived))
## [1] "0" "1"

Assumption 2 - Logistic regression requires the observations to be independent of each other. I assume that majority of the observations are independent since most of the passengers are not related.

Assumption 3 - Logistic regression requires there to be little or no multicollinearity among the independent variables. This means that the independent variables should not be too highly correlated with each other.

#correlation between variables
corrplot(cor(data[2:length(data)] %>% select_if(is.numeric)), type = "upper", method = "number", tl.cex = 0.5, tl.col="black",number.cex = .5)

Assumption 4 - Logistic regression assumes linearity of independent variables and log odds.

I tested linearity in the logit by running Box-Tidwell Transformation Test. I added to the logistic model interaction terms which are the crossproduct of each independent times its natural logarithm ( (X)ln(X)] ). If these terms are significant, then there is non-linearity in the logit.

#count negative values and 0s
count_neg(data)
##    variable_name_column number_negative_column percentage
## 1                Pclass                      0          0
## 2             Last_Name                      0          0
## 3            Salutation                      0          0
## 4            First_Name                      0          0
## 5                   Sex                      0          0
## 6                 SibSp                      0          0
## 7                 Parch                      0          0
## 8                Ticket                      0          0
## 9                  Fare                      0          0
## 10                  Age                      0          0
## 11             Embarked                      0          0
## 12                Floor                      0          0
## 13                 Room                      0          0
## 14       with_relatives                      0          0
count_nas(data)
##    variable_name_column number_missing_column percentage
## 1              Survived                     0          0
## 2                Pclass                     0          0
## 3             Last_Name                     0          0
## 4            Salutation                     0          0
## 5            First_Name                     0          0
## 6                   Sex                     0          0
## 7                 SibSp                     0          0
## 8                 Parch                     0          0
## 9                Ticket                     0          0
## 10                 Fare                     0          0
## 11                  Age                     0          0
## 12             Embarked                     0          0
## 13                Floor                     0          0
## 14                 Room                     0          0
## 15       with_relatives                     0          0
count_zeros(data)
##    variable_name_column number_zero_column percentage
## 1                 Parch                678      76.09
## 2                 SibSp                608      68.24
## 3                  Fare                 15       1.68
## 4                Pclass                  0       0.00
## 5             Last_Name                  0       0.00
## 6            Salutation                  0       0.00
## 7            First_Name                  0       0.00
## 8                   Sex                  0       0.00
## 9                Ticket                  0       0.00
## 10                  Age                  0       0.00
## 11             Embarked                  0       0.00
## 12                Floor                  0       0.00
## 13                 Room                  0       0.00
## 14       with_relatives                  0       0.00
#solve problem with 0s as Box Tidwell test doesn't accept 0s
#data <- data %>% mutate(Fare = Fare + 0.001,Room = Room + 0.001, Parch = Parch + 0.001, SibSp = SibSp + 0.001)
#create dataframe to store modified data
data_modified <- data

#resolve problem with non-negative values
for (i in 2:(ncol(data_modified)-1)){
  
  for (j in 1:nrow(data_modified)){
if (is.numeric(data_modified[j,i])==TRUE && data_modified[j,i] <= 0 ) {
  data_modified[j,i]=data_modified[j,i]+0.01}
  }
}

head(data_modified)
##   PassengerId Survived Pclass Last_Name Salutation
## 1           1        0  Lower    Braund         Mr
## 2           2        1  Upper   Cumings        Mrs
## 3           3        1  Lower Heikkinen       Miss
## 4           4        1  Upper  Futrelle        Mrs
## 5           5        0  Lower     Allen         Mr
## 6           6        0  Lower     Moran         Mr
##                              First_Name    Sex SibSp Parch
## 1                           Owen Harris   male  1.00  0.01
## 2 John Bradley (Florence Briggs Thayer) female  1.00  0.01
## 3                                 Laina female  0.01  0.01
## 4         Jacques Heath (Lily May Peel) female  1.00  0.01
## 5                         William Henry   male  0.01  0.01
## 6                                 James   male  0.01  0.01
##             Ticket    Fare Age    Embarked Floor Room with_relatives
## 1        A/5 21171  7.2500  22 Southampton     3   15             No
## 2         PC 17599 71.2833  38   Cherbourg     1   85             No
## 3 STON/O2. 3101282  7.9250  26 Southampton     3   35            Yes
## 4           113803 53.1000  35 Southampton     1  123             No
## 5           373450  8.0500  35 Southampton     3  124            Yes
## 6           330877  8.4583  36  Queenstown     3  101            Yes
#data transformation
for (i in 2:ncol(data_modified)) {
  

  
  if (colnames(data_modified[i])!="OverallQual" || colnames(data_modified[i])!="GarageCars"){
  
  if (is.numeric(data_modified[,i])==TRUE){
    
print(colnames(data_modified[i]))   
  
Box = boxcox(data_modified[,i] ~ 1,               # Transform Turbidity as a single vector
             lambda = seq(-10,10,1)      # Try values -10 to 10 by 0.1
             )
Cox = data.frame(Box$x, Box$y)            # Create a data frame with the results
Cox2 = Cox[with(Cox, order(-Cox$Box.y)),] # Order the new data frame by decreasing y
#Cox2[1,]                                  # Display the lambda with the greatest
                                          #    log likelihood
lambda = Cox2[1, "Box.x"]                 # Extract that lambda
print(lambda)
data_modified[,i] = (data_modified[,i] ^ lambda - 1)/lambda   # Transform the original data

#plotNormalHistogram(data[,i],main=names(data)[i])
#qqnorm(data[,i],main=names(data)[i])
#qqline(data[,i])
  }
  }

}
## [1] "SibSp"

## [1] -0.5050505
## [1] "Parch"

## [1] -0.7070707
## [1] "Fare"

## [1] 0.3030303
## [1] "Age"

## [1] 0.7070707
## [1] "Room"

## [1] 0.5050505
ggplot(data, aes(x=Fare, fill=Survived, color=Survived)) +
  geom_histogram(position="identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

par(mfrow=c(1,2))
ggplot(data, aes(x=Fare, fill=Survived, color=Survived)) +
  geom_histogram(position="identity")+
stat_function(fun = dnorm, args = list(mean = mean(data$Age), sd = sd(data$Age)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data_modified, aes(x=Fare, fill=Survived, color=Survived)) +
  geom_histogram(position="identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#remove variables First Name, Last Name and Ticket from dataset
data <- data %>% dplyr::select(-First_Name,-Last_Name,-Ticket,-PassengerId)
data_modified <- data_modified %>% dplyr::select(-First_Name,-Last_Name,-Ticket,-PassengerId)
  1. SELECT MODELS

Use stepwise approach to build the model.

The best three regression models were built based on the stepwise approach that analyses all combination of variables and selects the best regression model based on lowest AIC (Akaike’s criterion) value. Lower values of AIC indicate the preferred model, that is, the one with the fewest parameters that still provides an adequate fit to the data. The step function ignored redundant variables (the variables that are highly correlated).

#build glm model using stepwise approach
model.null = glm(Survived ~ 1, 
                 data = data_modified,
                 family = binomial(link="logit")
                 )

model.full = glm(Survived ~ .,
                 data = data_modified,
                 family = binomial(link="logit")
                 )
     
step(model.null,
     scope = list(upper=model.full),
             direction = "both",
             test = "Chisq",
             data = data_modified)
## Start:  AIC=1188.66
## Survived ~ 1
## 
##                  Df Deviance     AIC    LRT  Pr(>Chi)    
## + Salutation     16   869.38  903.38 317.28 < 2.2e-16 ***
## + Sex             1   917.80  921.80 268.85 < 2.2e-16 ***
## + Pclass          2  1083.11 1089.11 103.55 < 2.2e-16 ***
## + Floor           2  1083.11 1089.11 103.55 < 2.2e-16 ***
## + Fare            1  1089.52 1093.52  97.13 < 2.2e-16 ***
## + with_relatives  1  1149.96 1153.96  36.70 1.381e-09 ***
## + Embarked        2  1161.29 1167.29  25.36 3.107e-06 ***
## + Parch           1  1167.81 1171.81  18.84 1.421e-05 ***
## + SibSp           1  1175.63 1179.63  11.02 0.0009007 ***
## + Age             1  1176.87 1180.87   9.79 0.0017549 ** 
## + Room            1  1180.48 1184.48   6.17 0.0129727 *  
## <none>               1186.66 1188.66                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=903.38
## Survived ~ Salutation
## 
##                  Df Deviance     AIC    LRT  Pr(>Chi)    
## + Pclass          2   772.04  810.04  97.34 < 2.2e-16 ***
## + Floor           2   772.04  810.04  97.34 < 2.2e-16 ***
## + Fare            1   834.72  870.72  34.66 3.923e-09 ***
## + Embarked        2   851.63  889.63  17.75 0.0001398 ***
## + SibSp           1   863.49  899.49   5.88 0.0152746 *  
## + Parch           1   863.55  899.55   5.83 0.0157791 *  
## <none>                869.38  903.38                     
## + Sex             1   867.46  903.46   1.92 0.1655766    
## + with_relatives  1   867.88  903.88   1.50 0.2210173    
## + Room            1   868.17  904.17   1.20 0.2727701    
## + Age             1   868.86  904.86   0.52 0.4725165    
## - Salutation     16  1186.66 1188.66 317.28 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=810.04
## Survived ~ Salutation + Pclass
## 
##                  Df Deviance     AIC     LRT  Pr(>Chi)    
## + Age             1   759.76  799.76  12.277 0.0004585 ***
## + SibSp           1   763.93  803.93   8.107 0.0044098 ** 
## + Parch           1   764.31  804.31   7.726 0.0054431 ** 
## + Embarked        2   764.13  806.13   7.906 0.0192013 *  
## + with_relatives  1   766.49  806.49   5.548 0.0184992 *  
## <none>                772.04  810.04                      
## + Sex             1   770.44  810.44   1.600 0.2059046    
## + Room            1   771.12  811.12   0.919 0.3377892    
## + Fare            1   771.40  811.40   0.644 0.4222235    
## - Pclass          2   869.38  903.38  97.338 < 2.2e-16 ***
## - Salutation     16  1083.11 1089.11 311.068 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=799.76
## Survived ~ Salutation + Pclass + Age
## 
##                  Df Deviance     AIC     LRT  Pr(>Chi)    
## + Parch           1   744.52  786.52  15.243 9.451e-05 ***
## + SibSp           1   745.71  787.71  14.057 0.0001774 ***
## + with_relatives  1   746.89  788.89  12.869 0.0003341 ***
## + Embarked        2   752.53  796.53   7.237 0.0268183 *  
## + Fare            1   757.61  799.61   2.153 0.1423252    
## <none>                759.76  799.76                      
## + Sex             1   758.04  800.04   1.719 0.1898702    
## + Room            1   759.68  801.68   0.081 0.7757319    
## - Age             1   772.04  810.04  12.277 0.0004585 ***
## - Pclass          2   868.86  904.86 109.099 < 2.2e-16 ***
## - Salutation     16  1019.44 1027.44 259.678 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=786.52
## Survived ~ Salutation + Pclass + Age + Parch
## 
##                  Df Deviance     AIC     LRT  Pr(>Chi)    
## + SibSp           1   735.79  779.79   8.733  0.003124 ** 
## + Embarked        2   738.01  784.01   6.511  0.038559 *  
## + with_relatives  1   741.90  785.90   2.615  0.105875    
## <none>                744.52  786.52                      
## + Sex             1   742.77  786.77   1.751  0.185757    
## + Room            1   744.01  788.01   0.506  0.477087    
## + Fare            1   744.51  788.51   0.006  0.939742    
## - Parch           1   759.76  799.76  15.243 9.451e-05 ***
## - Age             1   764.31  804.31  19.794 8.623e-06 ***
## - Pclass          2   863.53  901.53 119.007 < 2.2e-16 ***
## - Salutation     16  1017.41 1027.41 272.895 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=779.79
## Survived ~ Salutation + Pclass + Age + Parch + SibSp
## 
##                  Df Deviance     AIC     LRT  Pr(>Chi)    
## + Embarked        2   729.89  777.89   5.896  0.052449 .  
## <none>                735.79  779.79                      
## + Sex             1   734.28  780.28   1.507  0.219551    
## + with_relatives  1   734.30  780.30   1.486  0.222835    
## + Fare            1   735.12  781.12   0.666  0.414442    
## + Room            1   735.39  781.39   0.393  0.530565    
## - SibSp           1   744.52  786.52   8.733  0.003124 ** 
## - Parch           1   745.71  787.71   9.920  0.001635 ** 
## - Age             1   759.59  801.59  23.802 1.068e-06 ***
## - Pclass          2   860.33  900.33 124.548 < 2.2e-16 ***
## - Salutation     16  1017.38 1029.38 281.592 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=777.89
## Survived ~ Salutation + Pclass + Age + Parch + SibSp + Embarked
## 
##                  Df Deviance     AIC     LRT  Pr(>Chi)    
## <none>                729.89  777.89                      
## + Sex             1   728.03  778.03   1.859  0.172693    
## + Room            1   729.15  779.15   0.744  0.388538    
## + with_relatives  1   729.21  779.21   0.682  0.409042    
## + Fare            1   729.65  779.65   0.240  0.624159    
## - Embarked        2   735.79  779.79   5.896  0.052449 .  
## - SibSp           1   738.01  784.01   8.118  0.004383 ** 
## - Parch           1   739.59  785.59   9.700  0.001843 ** 
## - Age             1   752.33  798.33  22.442 2.166e-06 ***
## - Pclass          2   840.88  884.88 110.993 < 2.2e-16 ***
## - Salutation     16   999.72 1015.72 269.832 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Call:  glm(formula = Survived ~ Salutation + Pclass + Age + Parch + 
##     SibSp + Embarked, family = binomial(link = "logit"), data = data_modified)
## 
## Coefficients:
##            (Intercept)           SalutationCol           SalutationDon  
##              -15.67628                14.39604                -3.10836  
##           SalutationDr      SalutationJonkheer          SalutationLady  
##               14.16814                -2.60442                30.96503  
##        SalutationMajor        SalutationMaster          SalutationMiss  
##               14.35208                16.21931                16.38599  
##         SalutationMlle           SalutationMme            SalutationMr  
##               29.36970                29.36970                13.70258  
##          SalutationMrs            SalutationMs           SalutationRev  
##               17.66501                31.48623                -0.94928  
##          SalutationSir  Salutationthe Countess            PclassMiddle  
##               31.00108                30.33017                 1.37437  
##            PclassUpper                     Age                   Parch  
##                2.73698                -0.11237                -0.02369  
##                  SibSp               Embarked2               Embarked3  
##               -0.03524                -0.23957                -0.58078  
## 
## Degrees of Freedom: 890 Total (i.e. Null);  867 Residual
## Null Deviance:       1187 
## Residual Deviance: 729.9     AIC: 777.9

Test Goodness of Fit.

Deviance analysis, Likelihood Ratio Test, Pseudo R^2 Test and Hosmer-Lemeshow Test were performed to test goodness of fit of the regression.

  1. Likelihood Ratio Test.

A logistic regression is said to provide a better fit to the data if it demonstrates an improvement over a model with fewer predictors. This is performed using the likelihood ratio test, which compares the likelihood of the data under the full model (Model #1) against the likelihood of the data under a model with fewer predictors (Model #3). Removing predictor variables from a model will almost always make the model fit less well, but it is necessary to test whether the observed difference in model fit is statistically significant.

The following hypothesis were stated:

The Null Hypothesis: reduced model is true. The Alternative Hypothesis: reduced model is false.

#final model
final.model <- glm(formula = Survived ~ Salutation + Pclass + SibSp + Age + 
    Parch + Embarked + Room, family = binomial(link = "logit"), data = data)

summary(final.model)
## 
## Call:
## glm(formula = Survived ~ Salutation + Pclass + SibSp + Age + 
##     Parch + Embarked + Room, family = binomial(link = "logit"), 
##     data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4786  -0.5452  -0.3381   0.5042   2.5561  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -1.481e+01  2.400e+03  -0.006   0.9951    
## SalutationCol           1.491e+01  2.400e+03   0.006   0.9950    
## SalutationDon          -2.688e+00  3.393e+03  -0.001   0.9994    
## SalutationDr            1.479e+01  2.400e+03   0.006   0.9951    
## SalutationJonkheer     -2.301e+00  3.393e+03  -0.001   0.9995    
## SalutationLady          3.142e+01  3.393e+03   0.009   0.9926    
## SalutationMajor         1.479e+01  2.400e+03   0.006   0.9951    
## SalutationMaster        1.723e+01  2.400e+03   0.007   0.9943    
## SalutationMiss          1.686e+01  2.400e+03   0.007   0.9944    
## SalutationMlle          2.986e+01  2.939e+03   0.010   0.9919    
## SalutationMme           2.981e+01  3.393e+03   0.009   0.9930    
## SalutationMr            1.407e+01  2.400e+03   0.006   0.9953    
## SalutationMrs           1.797e+01  2.400e+03   0.007   0.9940    
## SalutationMs            3.186e+01  3.393e+03   0.009   0.9925    
## SalutationRev          -5.747e-01  2.583e+03   0.000   0.9998    
## SalutationSir           3.147e+01  3.393e+03   0.009   0.9926    
## Salutationthe Countess  3.069e+01  3.393e+03   0.009   0.9928    
## PclassMiddle            1.175e+00  2.569e-01   4.575 4.75e-06 ***
## PclassUpper             2.674e+00  2.922e-01   9.152  < 2e-16 ***
## SibSp                  -6.056e-01  1.277e-01  -4.743 2.10e-06 ***
## Age                    -4.333e-02  9.268e-03  -4.675 2.94e-06 ***
## Parch                  -3.340e-01  1.352e-01  -2.469   0.0135 *  
## Embarked2              -9.415e-02  4.116e-01  -0.229   0.8191    
## Embarked3              -4.181e-01  2.545e-01  -1.643   0.1004    
## Room                   -1.893e-03  2.970e-03  -0.638   0.5238    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.66  on 890  degrees of freedom
## Residual deviance:  704.69  on 866  degrees of freedom
## AIC: 754.69
## 
## Number of Fisher Scoring iterations: 15
#reduced model with fewer parameters
model2 <- glm(formula = Survived ~ Salutation + Pclass + SibSp + Parch + Fare,
              family = binomial(link = "logit"), data = data)

model3 <- glm(formula = Survived ~ Salutation + Pclass + SibSp,
              family = binomial(link = "logit"), data = data)

#Likelihood Ratio Test
anova(final.model, model2, test ="Chisq")
## Analysis of Deviance Table
## 
## Model 1: Survived ~ Salutation + Pclass + SibSp + Age + Parch + Embarked + 
##     Room
## Model 2: Survived ~ Salutation + Pclass + SibSp + Parch + Fare
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       866     704.69                          
## 2       869     731.06 -3  -26.372 7.971e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(final.model, model3, test ="Chisq")
## Analysis of Deviance Table
## 
## Model 1: Survived ~ Salutation + Pclass + SibSp + Age + Parch + Embarked + 
##     Room
## Model 2: Survived ~ Salutation + Pclass + SibSp
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       866     704.69                          
## 2       871     739.90 -5  -35.211 1.366e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The results of Likelihood Ratio Test shows that null hypothesis is rejected (as p-value is less than significance level of 5%). It would provide evidence against the reduced model in favor of the final model.

  1. Deviance analysis.

The residual deviance, that shows how well the response is predicted by the model when the predictors are included, was used to test whether logistic regression model provides an adequate fit for the data or not.

The following hypothesis were stated:

The Null Hypothesis: logistic regression model provides an adequate fit for the data. The Alternative Hypothesis: logistic regression model doesn’t provide an adequate fit for the data.

#residual deviance test
p_value = 1 - pchisq(final.model$deviance,final.model$df.residual)
p_value
## [1] 0.9999819

As p-value is a little bit greater than the significance level of 5% the null hypothesis is accepted. It suggests that the fitted values are not significantly different from the observed values.

  1. Pseudo R^2 Test.

Unlike linear regression with ordinary least squares estimation, there is no R-squire statistic, which explains the proportion of variance in the dependent variable that is explained by the predictors. However, there are a number of pseudo R2-squire metrics that could be of value. Most notable is McFadden’s R-squire. It measures ranges from 0 to just less than 1, with values closer to zero indicating that the model has no predictive power.

#Pseudo R^2 Test
pR2(final.model)
##          llh      llhNull           G2     McFadden         r2ML 
## -352.3442170 -593.3275684  481.9667028    0.4061557    0.4177922 
##         r2CU 
##    0.5676488

The results of the McFadden’s R-squire test shows that the model has a quite strong predictive power since the value of r2CU is close to 1 (r2CU equals to 0.8139615).

  1. Hosmer-Lemeshow Test.

Another approach to determining the goodness of fit is through the Homer-Lemeshow statistics, which is computed on data after the observations have been segmented into groups based on having similar predicted probabilities. It examines whether the observed proportions of events are similar to the predicted probabilities of occurrence in subgroups of the data set using a Pearson chi square test.

The following hypothesis were stated:

The Null Hypothesis: there is a good fit of the model. The Alternative Hypothesis: there is a poor fit of the model.

#Hosmer-Lemeshow Test
hoslem.test(data$Survived, fitted(final.model), g=10)
## Warning in Ops.factor(1, y): '-' not meaningful for factors
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  data$Survived, fitted(final.model)
## X-squared = 891, df = 8, p-value < 2.2e-16

Since p-vale is than significance level of 5% the null hypothesis is accepted. Hosmer-Lemeshow Test proves a good fit to the model.

Create a dummy variables

#create a dummy variables for Pclass="Middle"
data$Pclass_Middle <- ifelse(data$Pclass == "Middle",1,0)
data$Pclass_Upper <- ifelse(data$Pclass == "Upper",1,0)
data$Pclass_Middle <- ifelse(data$Pclass == "Middle",1,0)
data$Pclass_Upper <- ifelse(data$Pclass == "Upper",1,0)

data$Salutation_Col <- ifelse(data$Salutation  == "Col",1,0)
data$Salutation_Don <- ifelse(data$Salutation  == "Don",1,0)
data$Salutation_Dr <- ifelse(data$Salutation  == "Dr",1,0)
data$Salutation_Jonkheer  <- ifelse(data$Salutation  == "Jonkheer",1,0)
data$Salutation_Lady  <- ifelse(data$Salutation  == "Lady",1,0)
data$Salutation_Major   <- ifelse(data$Salutation  == "Major",1,0)
   
data$Salutation_Master  <- ifelse(data$Salutation  == "Master",1,0)
   
data$Salutation_Miss  <- ifelse(data$Salutation  == "Miss",1,0)
   
data$Salutation_Mlle  <- ifelse(data$Salutation  == "Mlle",1,0)
data$Salutation_Mme  <- ifelse(data$Salutation  == "Mme",1,0)   
data$Salutation_Mr  <- ifelse(data$Salutation  == "Mr",1,0)
data$Salutation_Mrs  <- ifelse(data$Salutation  == "Mrs",1,0)
data$Salutation_Ms  <- ifelse(data$Salutation  == "Ms",1,0)
data$Salutation_Rev  <- ifelse(data$Salutation  == "Rev",1,0)
data$Salutation_Sir  <- ifelse(data$Salutation  == "Sir",1,0)
data$Salutation_theCountess  <- ifelse(data$Salutation  == "the Countess",1,0)

Predict ‘Survived’ class for testing and training data sets.

#create a new variable 'probability'
data$probability <- c()
data_testing$probability <- c()

#calculate logit function
logit_p <-1.170e+00*data$Pclass_Middle+2.523e+00*data$Pclass_Upper-5.814e-01*data$SibSp-3.722e-02*data$Age-3.206e-01*data$Parch



#logit_p_testing <- 0.8597*data$Pclass_Middle + 1.5757*data$Pclass_Upper -0.6767*data$SibSp - 0.4269*data$Parch + 0.5131*data$Fare+ 2.0799*data$Age

#calculate probability
data$probability <- exp(1)^logit_p/(1+exp(1)^logit_p)
#data_testing$probability <- exp(1)^logit_p_testing/(1+exp(1)^logit_p_testing)
head(data)
##   Survived Pclass Salutation    Sex SibSp Parch    Fare Age    Embarked
## 1        0  Lower         Mr   male     1     0  7.2500  22 Southampton
## 2        1  Upper        Mrs female     1     0 71.2833  38   Cherbourg
## 3        1  Lower       Miss female     0     0  7.9250  26 Southampton
## 4        1  Upper        Mrs female     1     0 53.1000  35 Southampton
## 5        0  Lower         Mr   male     0     0  8.0500  35 Southampton
## 6        0  Lower         Mr   male     0     0  8.4583  36  Queenstown
##   Floor Room with_relatives Pclass_Middle Pclass_Upper Salutation_Col
## 1     3   15             No             0            0              0
## 2     1   85             No             0            1              0
## 3     3   35            Yes             0            0              0
## 4     1  123             No             0            1              0
## 5     3  124            Yes             0            0              0
## 6     3  101            Yes             0            0              0
##   Salutation_Don Salutation_Dr Salutation_Jonkheer Salutation_Lady
## 1              0             0                   0               0
## 2              0             0                   0               0
## 3              0             0                   0               0
## 4              0             0                   0               0
## 5              0             0                   0               0
## 6              0             0                   0               0
##   Salutation_Major Salutation_Master Salutation_Miss Salutation_Mlle
## 1                0                 0               0               0
## 2                0                 0               0               0
## 3                0                 0               1               0
## 4                0                 0               0               0
## 5                0                 0               0               0
## 6                0                 0               0               0
##   Salutation_Mme Salutation_Mr Salutation_Mrs Salutation_Ms Salutation_Rev
## 1              0             1              0             0              0
## 2              0             0              1             0              0
## 3              0             0              0             0              0
## 4              0             0              1             0              0
## 5              0             1              0             0              0
## 6              0             1              0             0              0
##   Salutation_Sir Salutation_theCountess probability
## 1              0                      0   0.1977780
## 2              0                      0   0.6288392
## 3              0                      0   0.2753352
## 4              0                      0   0.6545048
## 5              0                      0   0.2137110
## 6              0                      0   0.2075232
#create a new variable that specifies predicted class
#data_testing$target_pred <-c()
head(data_testing)
##   PassengerId Pclass                                         Name    Sex
## 1         892      3                             Kelly, Mr. James   male
## 2         893      3             Wilkes, Mrs. James (Ellen Needs) female
## 3         894      2                    Myles, Mr. Thomas Francis   male
## 4         895      3                             Wirz, Mr. Albert   male
## 5         896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female
## 6         897      3                   Svensson, Mr. Johan Cervin   male
##    Age SibSp Parch  Ticket    Fare Cabin Embarked
## 1 34.5     0     0  330911  7.8292  <NA>        Q
## 2 47.0     1     0  363272  7.0000  <NA>        S
## 3 62.0     0     0  240276  9.6875  <NA>        Q
## 4 27.0     0     0  315154  8.6625  <NA>        S
## 5 22.0     1     1 3101298 12.2875  <NA>        S
## 6 14.0     0     0    7538  9.2250  <NA>        S
#calculate probability
data = within(data, {
    Survived_pred = ifelse(data$probability < 0.5, 0, 1)
 })

#data_testing = within(data_testing, {
#    target_pred = ifelse(data_testing$probability < 0.5, 0, 1)
# })

#head(data_testing)

Calculate Classification Metrics.

The table below displays values of classification metrics that were generated by custom functions discussed in the previous sections.

#create confusion matrix
confusion_matrix <- table("Predicted" = data$Survived_pred, "Actual" = data$Survived)
confusion_matrix
##          Actual
## Predicted   0   1
##         0 443 191
##         1 106 151
#calculate true positive
TP <- confusion_matrix[4]

#calculate true negative
TN <- confusion_matrix[1]

#calculate false negative
FN <- confusion_matrix[2]

#calculate false positive
FP <- confusion_matrix[3]

#calculate accuracy
accuracy <- (confusion_matrix[1,1] + confusion_matrix[2,2])/nrow(data)
accuracy
## [1] 0.6666667
#calculate accuracy classification error rate
classification_error_rate = (FP + FN)/(TP + FP + TN + FN)
classification_error_rate
## [1] 0.3333333
#calculate precision
precision = TP/(TP + FP)
precision
## [1] 0.4415205
#calculate sensitivity
sensitivity = TP/(TP + FN)
sensitivity
## [1] 0.5875486
#calculate specificity
specificity <- TN/(TN + FP)
specificity
## [1] 0.6987382
#calculate F1 score
F1_score <- (2*precision*sensitivity)/(precision + sensitivity)
F1_score
## [1] 0.5041736
roc.val <- roc(Survived~probability, data)
plot(roc.val, main="ROC plot") 

roc.val$auc
## Area under the curve: 0.709

Run different classifiers.

  1. J48
#create vectors to store accuracies values and errors values

classifier_name <-c()
Accuracy_mean <- c()
Accuracy_upper <- c()
Accuracy_lower <- c()
Kappa <- c()
MAE <- c()
RMSE <- c()
RAE <- c()
RRSE <- c()

classifier_name[1] <-c("J48")

#create empty vectors to store accuracies values and errors values for a certain classifier 
pctCorrect_vector <- c()
Kappa_vector <- c()
MAE_vector <- c()
RMSE_vector <- c()
RAE_vector <- c()
RRSE_vector <- c()

for (i in 1:50){
  
    #set the seed of R‘s random number generator
    set.seed(i)
    
    resultJ48 <- J48(Survived ~., data)
    evaluation <- evaluate_Weka_classifier(resultJ48)$details
    pctCorrect_value <- evaluation["pctCorrect"]
    Kappa_value <- evaluation["kappa"]
    MAE_value <- evaluation["meanAbsoluteError"]
    RMSE_value <- evaluation["rootMeanSquaredError"]
    RAE_value <- evaluation["rootMeanSquaredError"]
    RRSE_value <- evaluation["rootRelativeSquaredError"]
    
    #add the value generated after each iteration to the vector 
    pctCorrect_vector <- c(pctCorrect_vector,pctCorrect_value)
    Kappa_vector <- c(Kappa_vector, Kappa_value)
    MAE_vector <- c(MAE_vector, MAE_value)
    RMSE_vector <- c(RMSE_vector, RMSE_value)
    RAE_vector <- c(RAE_vector, RAE_value)
    RRSE_vector <- c(RRSE_vector, RRSE_value)
    
}

#calculate mean and standard deviation of accuracies and all errors
Accuracy_mean[1] <- mean(pctCorrect_vector)
pctCorrect_sd <- sd(pctCorrect_vector)
Accuracy_upper[1] <-Accuracy_mean[1] + pctCorrect_sd
Accuracy_lower <-Accuracy_mean[1] - pctCorrect_sd
Kappa[1] <- mean(Kappa_vector)
MAE[1] <- mean(MAE_vector)
RMSE[1] <- mean(RMSE_vector)
RAE[1] <- mean(RAE_vector)
RRSE[1] <- mean(RRSE_vector)
  1. Naive Bayes
classifier_name[2] <-c("Naive Bayes")

#create empty vectors to store accuracies values and errors values
pctCorrect_vector <- c()
Kappa_vector <- c()
MAE_vector <- c()
RMSE_vector <- c()
RAE_vector <- c()
RRSE_vector <- c()
for (i in 1:50){
    set.seed(i)
    #create 66% training data set 
   
    NB <- make_Weka_classifier("weka/classifiers/bayes/NaiveBayes")
    result_NaiveBayes <- NB(Survived ~., data)
    evaluation <- evaluate_Weka_classifier(result_NaiveBayes)$details
    pctCorrect_value <- evaluation["pctCorrect"]
    Kappa_value <- evaluation["kappa"]
    MAE_value <- evaluation["meanAbsoluteError"]
    RMSE_value <- evaluation["rootMeanSquaredError"]
    RAE_value <- evaluation["rootMeanSquaredError"]
    RRSE_value <- evaluation["rootRelativeSquaredError"]
    
    #add the value generated after each iteration to the vector 
    pctCorrect_vector <- c(pctCorrect_vector,pctCorrect_value)
    Kappa_vector <- c(Kappa_vector, Kappa_value)
    MAE_vector <- c(MAE_vector, MAE_value)
    RMSE_vector <- c(RMSE_vector, RMSE_value)
    RAE_vector <- c(RAE_vector, RAE_value)
    RRSE_vector <- c(RRSE_vector, RRSE_value)
}

#calculate mean and standard deviation of accuracies and all errors
Accuracy_mean[2] <- mean(pctCorrect_vector)
pctCorrect_sd[2] <- sd(pctCorrect_vector)
Accuracy_upper[2] <-Accuracy_mean[2] + pctCorrect_sd[2]
Accuracy_lower[2] <-Accuracy_mean[2] - pctCorrect_sd[2]
Kappa[2] <- mean(Kappa_vector)
MAE[2] <- mean(MAE_vector)
RMSE[2] <- mean(RMSE_vector)
RAE[2] <- mean(RAE_vector)
RRSE[2] <- mean(RRSE_vector)
  1. Knn
classifier_name[3] <-c("Knn")

#ignore the loop for this classifier since it loads very slow
#for (i in 1:50){
    #set.seed(i)
   
    knn <- IBk(Survived ~., data)
    evaluation <- evaluate_Weka_classifier(knn)$details
    pctCorrect_value <- evaluation["pctCorrect"]
    Kappa_value <- evaluation["Kappa"]
    MAE_value <- evaluation["meanAbsoluteError"]
    RMSE_value <- evaluation["rootMeanSquaredError"]
    RAE_value <- evaluation["rootMeanSquaredError"]
    RRSE_value <- evaluation["rootRelativeSquaredError"]
   
#}

#calculate mean and standard deviation of accuracies and all errors
Accuracy_mean[3] <- mean(pctCorrect_value)
pctCorrect_sd[3] <- sd(pctCorrect_value)
Accuracy_upper[3] <- ""
Accuracy_lower[3] <- ""
Kappa[3] <- mean(Kappa_value)
MAE[3] <- mean(MAE_value)
RMSE[3] <- mean(RMSE_value)
RAE[3] <- mean(RAE_value)
RRSE[3] <- mean(RRSE_value)

#confusion Matrix
evaluate_Weka_classifier(knn)$confusionMatrix
##    predicted
##       0   1
##   0 549   0
##   1   0 342
#ROC
plot(roc(data$Survived, predict(knn, data, type = "prob")[,2]))

  1. SVM
classifier_name[4] <-c("SVM")

#ignore the loop for this classifier since it loads very slow
#for (i in 1:50){
    #set.seed(i)
    #create 66% training data set 
    
    SVM <- SMO(Survived ~., data)
    evaluation <- evaluate_Weka_classifier(SVM)$details
    pctCorrect_value <- evaluation["pctCorrect"]
    Kappa_value <- evaluation["kappa"]
    MAE_value <- evaluation["meanAbsoluteError"]
    RMSE_value <- evaluation["rootMeanSquaredError"]
    RAE_value <- evaluation["rootMeanSquaredError"]
    RRSE_value <- evaluation["rootRelativeSquaredError"]
#}

    #calculate mean and standard deviation of accuracies and all errors
Accuracy_mean[4] <- mean(pctCorrect_value)
#pctCorrect_sd[4] <- sd(pctCorrect_value)
Accuracy_upper[4] <- ""
Accuracy_lower[4] <- ""
Kappa[4] <- mean(Kappa_value)
MAE[4] <- mean(MAE_value)
RMSE[4] <- mean(RMSE_value)
RAE[4] <- mean(RAE_value)
RRSE[4] <- mean(RRSE_value)    
data_table <- data.frame(classifier_name, Accuracy_mean, Accuracy_upper, Accuracy_lower, Kappa, MAE, RMSE, RAE, RRSE)
data_table
##   classifier_name Accuracy_mean   Accuracy_upper   Accuracy_lower
## 1             J48      88.21549 88.2154882154882 88.2154882154882
## 2     Naive Bayes      80.02245 80.0224466891134 80.0224466891134
## 3             Knn     100.00000                                  
## 4             SVM      83.27722                                  
##       Kappa         MAE        RMSE         RAE       RRSE
## 1 0.7445255 0.195435065 0.312598036 0.312598036 64.2783414
## 2 0.5863186 0.205613384 0.395207490 0.395207490 81.2650081
## 3        NA 0.001098478 0.001103904 0.001103904  0.2269916
## 4 0.6411211 0.167227834 0.408935000 0.408935000 84.0877436
#rename accuracy_mean column and divide by 100
#converting wide data format to long data

data_table <- data_table %>% dplyr::rename(Accuracy = Accuracy_mean) %>% dplyr::mutate(Accuracy = Accuracy/100) %>% dplyr::select(-Accuracy_upper,-Accuracy_lower,-Kappa,-RRSE) 
data_table
##   classifier_name  Accuracy         MAE        RMSE         RAE
## 1             J48 0.8821549 0.195435065 0.312598036 0.312598036
## 2     Naive Bayes 0.8002245 0.205613384 0.395207490 0.395207490
## 3             Knn 1.0000000 0.001098478 0.001103904 0.001103904
## 4             SVM 0.8327722 0.167227834 0.408935000 0.408935000
data1 <- data_table %>% tidyr::gather(accuracy_error,value,colnames(data_table)[2]:colnames(data_table)[5]) 
ggplot(data1, aes(factor(accuracy_error), value, fill = classifier_name)) + geom_bar(stat="identity", position = "dodge") 

#+ scale_fill_brewer(palette = "Set1")