Link to the project in RPubs: http://rpubs.com/ofomicheva86/390683
#required packages
library(knitr)
library(rcompanion)
library(corrplot)
library(PerformanceAnalytics)
library(GGally)
library(RColorBrewer)
library(VIM)
library(dplyr)
library(tidyr)
library(mice)
library(pROC)
library(caret)
library(pscl)
library(ResourceSelection)
library(car)
library(speedglm)
library(gdata)
library(ggplot2)
library(RWeka)
library(MASS) #provides boxcox(), used for the Box-Cox transformations below
1. DATA EXPLORATION
The dataset contains the variables described below:
Read the training and testing data sets.
#read training data set
#replace blanks with NAs
data <- read.csv(file=
"https://raw.githubusercontent.com/olga0503/DATA-621/master/titanic_train.csv",
header=T, na.strings=c("","NA"))
#read testing data set
data_testing <- read.csv(file=
"https://raw.githubusercontent.com/olga0503/DATA-621/master/titanic_test.csv",
header=T, na.strings=c("","NA"))
#display first six entries
head(data)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 <NA> S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 <NA> S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 <NA> S
## 6 0 330877 8.4583 <NA> Q
#find dimensions
dim(data)
## [1] 891 12
#list the distinct values of 'Parch'
levels(factor(data$Parch))
## [1] "0" "1" "2" "3" "4" "5" "6"
Count missing values.
#chart for missing values
aggr(data[-1], prop = T, numbers = T, cex.axis=.8,
ylab=c("Proportion of missingness","Missingness Pattern"),
labels=names(data[-1]))
#build function that counts missing values
count_nas <- function(data){
variable_name_column <- c()
number_missing_column <- c()
for (i in 2:ncol(data)){
variable_name <- colnames(data[i])
number_missing <- sum(is.na(data[i]))
variable_name_column <- c(variable_name_column,variable_name)
number_missing_column <- c(number_missing_column,number_missing)
}
missing_table <- data.frame(variable_name_column,number_missing_column)
missing_table <- missing_table %>% mutate(percentage=round(number_missing_column*100/nrow(data),2)) %>% arrange(desc(percentage))
missing_table
}
#build function that counts negative values
count_neg <- function(data){
variable_name_column <- c()
number_negative_column <- c()
for (i in 3:ncol(data)){
neg_count <- 0
variable_name <- colnames(data[i])
for (j in 1:nrow(data)){
if(is.numeric(data[j,i]) && !is.na(data[j,i]) && data[j,i] < 0) {
neg_count <- neg_count + 1
}
}
number_negative_column <- c(number_negative_column,neg_count)
variable_name_column <- c(variable_name_column,variable_name)
}
negative_table <- data.frame(variable_name_column,number_negative_column)
negative_table <- negative_table %>% mutate(percentage=round(number_negative_column*100/nrow(data),2)) %>% arrange(desc(percentage))
negative_table
}
#build function that counts 0s
count_zeros <- function(data){
variable_name_column <- c()
number_zero_column <- c()
for (i in 3:ncol(data)){
zero_count <- 0
variable_name <- colnames(data[i])
for (j in 1:nrow(data)){
if(is.numeric(data[j,i]) && !is.na(data[j,i]) && data[j,i] == 0) {
zero_count <- zero_count + 1
}
}
number_zero_column <- c(number_zero_column,zero_count)
variable_name_column <- c(variable_name_column,variable_name)
}
zero_table <- data.frame(variable_name_column,number_zero_column)
zero_table <- zero_table %>% mutate(percentage=round(number_zero_column*100/nrow(data),2)) %>% arrange(desc(percentage))
zero_table
}
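As a quick cross-check of the custom functions above, the per-column NA counts can also be obtained with a base-R one-liner:
#cross-check the per-column NA counts
sort(colSums(is.na(data)), decreasing = TRUE)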
#count NAs
count_nas(data)
## variable_name_column number_missing_column percentage
## 1 Cabin 687 77.10
## 2 Age 177 19.87
## 3 Embarked 2 0.22
## 4 Survived 0 0.00
## 5 Pclass 0 0.00
## 6 Name 0 0.00
## 7 Sex 0 0.00
## 8 SibSp 0 0.00
## 9 Parch 0 0.00
## 10 Ticket 0 0.00
## 11 Fare 0 0.00
Create additional variables based on existing variables.
#split the variable 'Name' into 'Last_Name', 'Salutation' and 'First_Name'
data <- data %>% separate(Name, c("Last_Name", "name"), sep = "\\,\\s+") %>% separate(name, c("Salutation", "First_Name"), sep = "\\.\\s+")
## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [514].
#split the variable 'Cabin' into 'Floor' and 'Room'
data <- data %>% mutate(Cabin2=Cabin) %>% separate(Cabin, c("Floor", "Cabin"), sep = "[:digit:]") %>% separate(Cabin2, c("Cabin2", "Room"), sep = "[:alpha:]") %>% dplyr::select(-Cabin, -Cabin2)
## Warning: Expected 2 pieces. Additional pieces discarded in 176 rows [2,
## 4, 7, 12, 22, 28, 32, 53, 55, 56, 62, 63, 67, 76, 89, 93, 98, 103, 111,
## 119, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [293,
## 328, 340, 474].
## Warning: Expected 2 pieces. Additional pieces discarded in 24 rows [28, 76,
## 89, 98, 119, 129, 298, 300, 306, 312, 342, 391, 436, 439, 499, 680, 700,
## 701, 716, 743, ...].
head(data)
## PassengerId Survived Pclass Last_Name Salutation
## 1 1 0 3 Braund Mr
## 2 2 1 1 Cumings Mrs
## 3 3 1 3 Heikkinen Miss
## 4 4 1 1 Futrelle Mrs
## 5 5 0 3 Allen Mr
## 6 6 0 3 Moran Mr
## First_Name Sex Age SibSp Parch
## 1 Owen Harris male 22 1 0
## 2 John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Laina female 26 0 0
## 4 Jacques Heath (Lily May Peel) female 35 1 0
## 5 William Henry male 35 0 0
## 6 James male NA 0 0
## Ticket Fare Floor Embarked Room
## 1 A/5 21171 7.2500 <NA> S <NA>
## 2 PC 17599 71.2833 C C 85
## 3 STON/O2. 3101282 7.9250 <NA> S <NA>
## 4 113803 53.1000 C S 123
## 5 373450 8.0500 <NA> S <NA>
## 6 330877 8.4583 <NA> Q <NA>
colnames(data)
## [1] "PassengerId" "Survived" "Pclass" "Last_Name" "Salutation"
## [6] "First_Name" "Sex" "Age" "SibSp" "Parch"
## [11] "Ticket" "Fare" "Floor" "Embarked" "Room"
Convert variables to proper formats.
#convert 'Survived', 'Pclass', 'Floor' and 'Salutation' to factor, 'Room' to integer and 'Ticket' to character
data <- data %>% mutate(Survived = as.factor(Survived),
Pclass = as.factor(Pclass),
Floor = as.factor(Floor),
Salutation = as.factor(Salutation),
Room = as.integer(Room),
Ticket = as.character(Ticket))
#for 'Pclass' assign the values 'Upper', 'Middle' and 'Lower' to '1', '2' and '3' respectively
data <- data %>% mutate(Pclass=factor(ifelse(Pclass=="1","Upper",ifelse(Pclass=="2","Middle",ifelse(Pclass=="3","Lower",Pclass)))))
#for 'Embarked' assign the values 'Cherbourg', 'Queenstown' and 'Southampton' to 'C', 'Q' and 'S' respectively
data <- data %>% mutate(Embarked=factor(ifelse(Embarked=="C","Cherbourg",ifelse(Embarked=="Q","Queenstown",ifelse(Embarked=="S","Southampton",Embarked)))))
#display first six records
head(data)
## PassengerId Survived Pclass Last_Name Salutation
## 1 1 0 Lower Braund Mr
## 2 2 1 Upper Cumings Mrs
## 3 3 1 Lower Heikkinen Miss
## 4 4 1 Upper Futrelle Mrs
## 5 5 0 Lower Allen Mr
## 6 6 0 Lower Moran Mr
## First_Name Sex Age SibSp Parch
## 1 Owen Harris male 22 1 0
## 2 John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Laina female 26 0 0
## 4 Jacques Heath (Lily May Peel) female 35 1 0
## 5 William Henry male 35 0 0
## 6 James male NA 0 0
## Ticket Fare Floor Embarked Room
## 1 A/5 21171 7.2500 3 Southampton NA
## 2 PC 17599 71.2833 1 Cherbourg 85
## 3 STON/O2. 3101282 7.9250 3 Southampton NA
## 4 113803 53.1000 1 Southampton 123
## 5 373450 8.0500 3 Southampton NA
## 6 330877 8.4583 3 Queenstown NA
Analyze whether missing values in 'Cabin' are predictive of the survival outcome.
#for 'Floor' replace NAs with "NONE"
#data <- data %>% mutate(Floor = factor(ifelse(is.na(Floor),"NONE",Floor)))
#for 'Room' replace NAs with 0
#data <- data %>% mutate(Room = as.integer(ifelse(is.na(Room),"0",Room)))
head(data)
## PassengerId Survived Pclass Last_Name Salutation
## 1 1 0 Lower Braund Mr
## 2 2 1 Upper Cumings Mrs
## 3 3 1 Lower Heikkinen Miss
## 4 4 1 Upper Futrelle Mrs
## 5 5 0 Lower Allen Mr
## 6 6 0 Lower Moran Mr
## First_Name Sex Age SibSp Parch
## 1 Owen Harris male 22 1 0
## 2 John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Laina female 26 0 0
## 4 Jacques Heath (Lily May Peel) female 35 1 0
## 5 William Henry male 35 0 0
## 6 James male NA 0 0
## Ticket Fare Floor Embarked Room
## 1 A/5 21171 7.2500 3 Southampton NA
## 2 PC 17599 71.2833 1 Cherbourg 85
## 3 STON/O2. 3101282 7.9250 3 Southampton NA
## 4 113803 53.1000 1 Southampton 123
## 5 373450 8.0500 3 Southampton NA
## 6 330877 8.4583 3 Queenstown NA
#create mosaic plot for 'Cabin floor'
count <- table(data$Survived, data$Floor)
mosaicplot(count, main = "Distribution of 'Survived'",
xlab = "Survived",
ylab = "Cabin",
las = 1,
border = "black",
shade = TRUE
)
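Since table() silently drops NA levels, a more direct way to check whether the missingness itself is predictive is an explicit missingness indicator (a sketch to be run before imputation; 'Room' is used here because it still contains NAs at this point):
#indicator for missing cabin information
cabin_missing <- factor(ifelse(is.na(data$Room), "Missing", "Recorded"))
#cross-tabulate the indicator with survival
table(cabin_missing, data$Survived)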
Apply multiple imputation.
#apply multiple imputation to the training data set
#exclude the variable 'PassengerId' from imputation
exclude <- c('PassengerId')
data_no_missing_variables <- data %>% dplyr::select(-Age,-Embarked,-Floor,-Room)
include <- setdiff(names(data), exclude)
data_include <- data[include]
#impute with predictive mean matching (pmm)
imp.data <- mice(data_include, m=20, method='pmm', printFlag=FALSE)
#extract the first completed data set and merge it back into the data frame
data <- mice::complete(imp.data)
data <- data.frame(data_no_missing_variables,data %>% dplyr::select(Age,Embarked,Floor,Room))
head(data)
## PassengerId Survived Pclass Last_Name Salutation
## 1 1 0 Lower Braund Mr
## 2 2 1 Upper Cumings Mrs
## 3 3 1 Lower Heikkinen Miss
## 4 4 1 Upper Futrelle Mrs
## 5 5 0 Lower Allen Mr
## 6 6 0 Lower Moran Mr
## First_Name Sex SibSp Parch
## 1 Owen Harris male 1 0
## 2 John Bradley (Florence Briggs Thayer) female 1 0
## 3 Laina female 0 0
## 4 Jacques Heath (Lily May Peel) female 1 0
## 5 William Henry male 0 0
## 6 James male 0 0
## Ticket Fare Age Embarked Floor Room
## 1 A/5 21171 7.2500 22 Southampton 3 15
## 2 PC 17599 71.2833 38 Cherbourg 1 85
## 3 STON/O2. 3101282 7.9250 26 Southampton 3 35
## 4 113803 53.1000 35 Southampton 1 123
## 5 373450 8.0500 35 Southampton 3 124
## 6 330877 8.4583 36 Queenstown 3 101
#confirm no NAs
count_nas(data)
## variable_name_column number_missing_column percentage
## 1 Survived 0 0
## 2 Pclass 0 0
## 3 Last_Name 0 0
## 4 Salutation 0 0
## 5 First_Name 0 0
## 6 Sex 0 0
## 7 SibSp 0 0
## 8 Parch 0 0
## 9 Ticket 0 0
## 10 Fare 0 0
## 11 Age 0 0
## 12 Embarked 0 0
## 13 Floor 0 0
## 14 Room 0 0
#write.csv(data, file = "/Users/olga/downloads/data_imputed.csv", row.names = FALSE)
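Note that mice::complete() above extracts only the first of the 20 imputed data sets. A sketch of the fully pooled multiple-imputation workflow, with an illustrative model formula, would be:
#fit the model on every imputed data set and pool the estimates via Rubin's rules
fit <- with(imp.data, glm(Survived ~ Age + SibSp + Parch + Fare, family = binomial))
summary(mice::pool(fit))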
data <- data %>% mutate(with_relatives=factor(ifelse(SibSp+Parch == 0, "No","Yes")))
Variables Analysis:
'Survived' is the response variable; it indicates the survival outcome and is binary, taking only the values 0 (did not survive) and 1 (survived).
#mosaic plot
count <- table(data$Pclass, data$Survived)
mosaicplot(count, main = "Pclass vs Survived",
xlab = "Pclass",
ylab = "Survived",
las = 1,
border = "black",
shade = TRUE
)
#mosaic plot
count <- table(data$Sex, data$Survived)
mosaicplot(count, main = "Sex vs Survived",
xlab = "Sex",
ylab = "Survived",
las = 1,
border = "black",
shade = TRUE
)
'Age' - age in years
boxplot(data$Age)
par(mfrow=c(1,2))
plotNormalHistogram(data$Age,main="Age")
qqnorm(data$Age)
qqline(data$Age)
boxplot(data$SibSp)
par(mfrow=c(1,2))
plotNormalHistogram(data$SibSp,main="SibSp")
qqnorm(data$SibSp)
qqline(data$SibSp)
#histogram of 'Fare' filled by survival status
ggplot(data, aes(x=Fare, fill=Survived, color=Survived)) +
geom_histogram(position="identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
boxplot(data$Parch)
par(mfrow=c(1,2))
plotNormalHistogram(data$Parch,main="Parch")
qqnorm(data$Parch)
qqline(data$Parch)
‘Ticket’ - ticket number
‘Fare’ - passenger fare
boxplot(data$Fare)
par(mfrow=c(1,2))
plotNormalHistogram(data$Fare,main="Fare")
qqnorm(data$Fare)
qqline(data$Fare)
The variable 'Cabin' was split into two new variables - 'Floor' and 'Room'.
#floor
count <- table(data$Floor, data$Survived)
mosaicplot(count, main = "Floor vs Survived",
xlab = "Floor",
ylab = "Survived",
las = 1,
border = "black",
shade = TRUE
)
#room
boxplot(data$Room)
par(mfrow=c(1,2))
plotNormalHistogram(data$Room,main="Room")
qqnorm(data$Room)
qqline(data$Room)
#mosaic plot
count <- table(data$Embarked, data$Survived)
mosaicplot(count, main = "Embarked vs Survived",
xlab = "Embarked",
ylab = "Survived",
las = 1,
border = "black",
shade = TRUE
)
The assumptions for the logistic regression are:
Assumption 1 - Logistic regression typically requires a large sample size. A common rule of thumb is a minimum of 10 cases of the least frequent outcome for each independent variable in the model. For example, if a model has 5 independent variables and the expected probability of the least frequent outcome is .10, then the minimum sample size is 500 (10*5 / .10), as sketched below.
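A quick arithmetic sketch of this rule of thumb, using the hypothetical values from the example above:
k <- 5 #number of independent variables (hypothetical)
p <- 0.10 #expected probability of the least frequent outcome (hypothetical)
10 * k / p #minimum sample size = 500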
dim(data)
## [1] 891 16
The data set, which contains 891 observations, can be considered large.
In addition, binary logistic regression requires the dependent variable to be binary.
levels(factor(data$Survived))
## [1] "0" "1"
Assumption 2 - Logistic regression requires the observations to be independent of each other. I assume that the majority of the observations are independent, since most of the passengers are not related to one another.
Assumption 3 - Logistic regression requires there to be little or no multicollinearity among the independent variables. This means that the independent variables should not be too highly correlated with each other.
#correlation between variables
corrplot(cor(data[2:length(data)] %>% select_if(is.numeric)), type = "upper", method = "number", tl.cex = 0.5, tl.col="black",number.cex = .5)
Assumption 4 - Logistic regression assumes linearity of the independent variables and the log odds.
I tested linearity in the logit by running the Box-Tidwell transformation test. I added to the logistic model interaction terms that are the cross-product of each independent variable and its natural logarithm, (X)ln(X). If these terms are significant, then there is non-linearity in the logit.
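A minimal sketch of this check for a single continuous predictor (Age is used purely for illustration; the same pattern applies to the other strictly positive predictors):
#Box-Tidwell-style check: add the Age*ln(Age) interaction term to the logit model
bt.check <- glm(Survived ~ Age + I(Age * log(Age)),
family = binomial(link = "logit"), data = data)
#a significant I(Age * log(Age)) coefficient indicates non-linearity in the logit
summary(bt.check)$coefficients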
#count negative values and 0s
count_neg(data)
## variable_name_column number_negative_column percentage
## 1 Pclass 0 0
## 2 Last_Name 0 0
## 3 Salutation 0 0
## 4 First_Name 0 0
## 5 Sex 0 0
## 6 SibSp 0 0
## 7 Parch 0 0
## 8 Ticket 0 0
## 9 Fare 0 0
## 10 Age 0 0
## 11 Embarked 0 0
## 12 Floor 0 0
## 13 Room 0 0
## 14 with_relatives 0 0
count_nas(data)
## variable_name_column number_missing_column percentage
## 1 Survived 0 0
## 2 Pclass 0 0
## 3 Last_Name 0 0
## 4 Salutation 0 0
## 5 First_Name 0 0
## 6 Sex 0 0
## 7 SibSp 0 0
## 8 Parch 0 0
## 9 Ticket 0 0
## 10 Fare 0 0
## 11 Age 0 0
## 12 Embarked 0 0
## 13 Floor 0 0
## 14 Room 0 0
## 15 with_relatives 0 0
count_zeros(data)
## variable_name_column number_zero_column percentage
## 1 Parch 678 76.09
## 2 SibSp 608 68.24
## 3 Fare 15 1.68
## 4 Pclass 0 0.00
## 5 Last_Name 0 0.00
## 6 Salutation 0 0.00
## 7 First_Name 0 0.00
## 8 Sex 0 0.00
## 9 Ticket 0 0.00
## 10 Age 0 0.00
## 11 Embarked 0 0.00
## 12 Floor 0 0.00
## 13 Room 0 0.00
## 14 with_relatives 0 0.00
#shift non-positive values, since the Box-Tidwell and Box-Cox transformations require strictly positive inputs
#data <- data %>% mutate(Fare = Fare + 0.001,Room = Room + 0.001, Parch = Parch + 0.001, SibSp = SibSp + 0.001)
#create dataframe to store modified data
data_modified <- data
#resolve the problem with non-positive values
for (i in 2:(ncol(data_modified)-1)){
for (j in 1:nrow(data_modified)){
if (is.numeric(data_modified[j,i])==TRUE && data_modified[j,i] <= 0 ) {
data_modified[j,i]=data_modified[j,i]+0.01}
}
}
head(data_modified)
## PassengerId Survived Pclass Last_Name Salutation
## 1 1 0 Lower Braund Mr
## 2 2 1 Upper Cumings Mrs
## 3 3 1 Lower Heikkinen Miss
## 4 4 1 Upper Futrelle Mrs
## 5 5 0 Lower Allen Mr
## 6 6 0 Lower Moran Mr
## First_Name Sex SibSp Parch
## 1 Owen Harris male 1.00 0.01
## 2 John Bradley (Florence Briggs Thayer) female 1.00 0.01
## 3 Laina female 0.01 0.01
## 4 Jacques Heath (Lily May Peel) female 1.00 0.01
## 5 William Henry male 0.01 0.01
## 6 James male 0.01 0.01
## Ticket Fare Age Embarked Floor Room with_relatives
## 1 A/5 21171 7.2500 22 Southampton 3 15 Yes
## 2 PC 17599 71.2833 38 Cherbourg 1 85 Yes
## 3 STON/O2. 3101282 7.9250 26 Southampton 3 35 No
## 4 113803 53.1000 35 Southampton 1 123 Yes
## 5 373450 8.0500 35 Southampton 3 124 No
## 6 330877 8.4583 36 Queenstown 3 101 No
#data transformation: apply a Box-Cox transformation to each numeric predictor
for (i in 2:ncol(data_modified)) {
if (is.numeric(data_modified[,i])==TRUE){
print(colnames(data_modified[i]))
Box = boxcox(data_modified[,i] ~ 1, # run Box-Cox on the column as a single vector
lambda = seq(-10,10,1) # try lambdas from -10 to 10 (boxcox interpolates between grid points)
)
Cox = data.frame(Box$x, Box$y) # create a data frame with the results
Cox2 = Cox[with(Cox, order(-Cox$Box.y)),] # order the data frame by decreasing log-likelihood
lambda = Cox2[1, "Box.x"] # extract the lambda with the greatest log-likelihood
print(lambda)
data_modified[,i] = (data_modified[,i] ^ lambda - 1)/lambda # transform the original data
#plotNormalHistogram(data_modified[,i],main=names(data_modified)[i])
#qqnorm(data_modified[,i],main=names(data_modified)[i])
#qqline(data_modified[,i])
}
}
## [1] "SibSp"
## [1] -0.5050505
## [1] "Parch"
## [1] -0.7070707
## [1] "Fare"
## [1] 0.3030303
## [1] "Age"
## [1] 0.7070707
## [1] "Room"
## [1] 0.5050505
ggplot(data, aes(x=Fare, fill=Survived, color=Survived)) +
geom_histogram(position="identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data, aes(x=Fare, fill=Survived, color=Survived)) +
geom_histogram(position="identity")+
stat_function(fun = dnorm, args = list(mean = mean(data$Fare), sd = sd(data$Fare)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data_modified, aes(x=Fare, fill=Survived, color=Survived)) +
geom_histogram(position="identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#remove the variables First_Name, Last_Name, Ticket and PassengerId from the data sets
data <- data %>% dplyr::select(-First_Name,-Last_Name,-Ticket,-PassengerId)
data_modified <- data_modified %>% dplyr::select(-First_Name,-Last_Name,-Ticket,-PassengerId)
Use a stepwise approach to build the model.
The best three regression models were built using the stepwise approach, which evaluates combinations of variables and selects the best regression model based on the lowest AIC (Akaike Information Criterion). Lower values of AIC indicate the preferred model, that is, the one with the fewest parameters that still provides an adequate fit to the data. The step function ignored redundant variables (variables that are highly correlated).
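For reference, AIC = 2k - 2*ln(L), where k is the number of estimated parameters and L is the maximized likelihood. A sketch verifying this by hand on an illustrative single-predictor model:
#compute AIC manually for an illustrative model and compare with AIC()
m <- glm(Survived ~ Pclass, family = binomial(link = "logit"), data = data_modified)
k <- attr(logLik(m), "df") #number of estimated parameters
2 * k - 2 * as.numeric(logLik(m)) #equals AIC(m)
AIC(m)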
#build glm model using stepwise approach
model.null = glm(Survived ~ 1,
data = data_modified,
family = binomial(link="logit")
)
model.full = glm(Survived ~ .,
data = data_modified,
family = binomial(link="logit")
)
step(model.null,
scope = list(upper=model.full),
direction = "both",
test = "Chisq",
data = data_modified)
## Start: AIC=1188.66
## Survived ~ 1
##
## Df Deviance AIC LRT Pr(>Chi)
## + Salutation 16 869.38 903.38 317.28 < 2.2e-16 ***
## + Sex 1 917.80 921.80 268.85 < 2.2e-16 ***
## + Pclass 2 1083.11 1089.11 103.55 < 2.2e-16 ***
## + Floor 2 1083.11 1089.11 103.55 < 2.2e-16 ***
## + Fare 1 1089.52 1093.52 97.13 < 2.2e-16 ***
## + with_relatives 1 1149.96 1153.96 36.70 1.381e-09 ***
## + Embarked 2 1161.29 1167.29 25.36 3.107e-06 ***
## + Parch 1 1167.81 1171.81 18.84 1.421e-05 ***
## + SibSp 1 1175.63 1179.63 11.02 0.0009007 ***
## + Age 1 1176.87 1180.87 9.79 0.0017549 **
## + Room 1 1180.48 1184.48 6.17 0.0129727 *
## <none> 1186.66 1188.66
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=903.38
## Survived ~ Salutation
##
## Df Deviance AIC LRT Pr(>Chi)
## + Pclass 2 772.04 810.04 97.34 < 2.2e-16 ***
## + Floor 2 772.04 810.04 97.34 < 2.2e-16 ***
## + Fare 1 834.72 870.72 34.66 3.923e-09 ***
## + Embarked 2 851.63 889.63 17.75 0.0001398 ***
## + SibSp 1 863.49 899.49 5.88 0.0152746 *
## + Parch 1 863.55 899.55 5.83 0.0157791 *
## <none> 869.38 903.38
## + Sex 1 867.46 903.46 1.92 0.1655766
## + with_relatives 1 867.88 903.88 1.50 0.2210173
## + Room 1 868.17 904.17 1.20 0.2727701
## + Age 1 868.86 904.86 0.52 0.4725165
## - Salutation 16 1186.66 1188.66 317.28 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=810.04
## Survived ~ Salutation + Pclass
##
## Df Deviance AIC LRT Pr(>Chi)
## + Age 1 759.76 799.76 12.277 0.0004585 ***
## + SibSp 1 763.93 803.93 8.107 0.0044098 **
## + Parch 1 764.31 804.31 7.726 0.0054431 **
## + Embarked 2 764.13 806.13 7.906 0.0192013 *
## + with_relatives 1 766.49 806.49 5.548 0.0184992 *
## <none> 772.04 810.04
## + Sex 1 770.44 810.44 1.600 0.2059046
## + Room 1 771.12 811.12 0.919 0.3377892
## + Fare 1 771.40 811.40 0.644 0.4222235
## - Pclass 2 869.38 903.38 97.338 < 2.2e-16 ***
## - Salutation 16 1083.11 1089.11 311.068 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=799.76
## Survived ~ Salutation + Pclass + Age
##
## Df Deviance AIC LRT Pr(>Chi)
## + Parch 1 744.52 786.52 15.243 9.451e-05 ***
## + SibSp 1 745.71 787.71 14.057 0.0001774 ***
## + with_relatives 1 746.89 788.89 12.869 0.0003341 ***
## + Embarked 2 752.53 796.53 7.237 0.0268183 *
## + Fare 1 757.61 799.61 2.153 0.1423252
## <none> 759.76 799.76
## + Sex 1 758.04 800.04 1.719 0.1898702
## + Room 1 759.68 801.68 0.081 0.7757319
## - Age 1 772.04 810.04 12.277 0.0004585 ***
## - Pclass 2 868.86 904.86 109.099 < 2.2e-16 ***
## - Salutation 16 1019.44 1027.44 259.678 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=786.52
## Survived ~ Salutation + Pclass + Age + Parch
##
## Df Deviance AIC LRT Pr(>Chi)
## + SibSp 1 735.79 779.79 8.733 0.003124 **
## + Embarked 2 738.01 784.01 6.511 0.038559 *
## + with_relatives 1 741.90 785.90 2.615 0.105875
## <none> 744.52 786.52
## + Sex 1 742.77 786.77 1.751 0.185757
## + Room 1 744.01 788.01 0.506 0.477087
## + Fare 1 744.51 788.51 0.006 0.939742
## - Parch 1 759.76 799.76 15.243 9.451e-05 ***
## - Age 1 764.31 804.31 19.794 8.623e-06 ***
## - Pclass 2 863.53 901.53 119.007 < 2.2e-16 ***
## - Salutation 16 1017.41 1027.41 272.895 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=779.79
## Survived ~ Salutation + Pclass + Age + Parch + SibSp
##
## Df Deviance AIC LRT Pr(>Chi)
## + Embarked 2 729.89 777.89 5.896 0.052449 .
## <none> 735.79 779.79
## + Sex 1 734.28 780.28 1.507 0.219551
## + with_relatives 1 734.30 780.30 1.486 0.222835
## + Fare 1 735.12 781.12 0.666 0.414442
## + Room 1 735.39 781.39 0.393 0.530565
## - SibSp 1 744.52 786.52 8.733 0.003124 **
## - Parch 1 745.71 787.71 9.920 0.001635 **
## - Age 1 759.59 801.59 23.802 1.068e-06 ***
## - Pclass 2 860.33 900.33 124.548 < 2.2e-16 ***
## - Salutation 16 1017.38 1029.38 281.592 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=777.89
## Survived ~ Salutation + Pclass + Age + Parch + SibSp + Embarked
##
## Df Deviance AIC LRT Pr(>Chi)
## <none> 729.89 777.89
## + Sex 1 728.03 778.03 1.859 0.172693
## + Room 1 729.15 779.15 0.744 0.388538
## + with_relatives 1 729.21 779.21 0.682 0.409042
## + Fare 1 729.65 779.65 0.240 0.624159
## - Embarked 2 735.79 779.79 5.896 0.052449 .
## - SibSp 1 738.01 784.01 8.118 0.004383 **
## - Parch 1 739.59 785.59 9.700 0.001843 **
## - Age 1 752.33 798.33 22.442 2.166e-06 ***
## - Pclass 2 840.88 884.88 110.993 < 2.2e-16 ***
## - Salutation 16 999.72 1015.72 269.832 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Call: glm(formula = Survived ~ Salutation + Pclass + Age + Parch +
## SibSp + Embarked, family = binomial(link = "logit"), data = data_modified)
##
## Coefficients:
## (Intercept) SalutationCol SalutationDon
## -15.67628 14.39604 -3.10836
## SalutationDr SalutationJonkheer SalutationLady
## 14.16814 -2.60442 30.96503
## SalutationMajor SalutationMaster SalutationMiss
## 14.35208 16.21931 16.38599
## SalutationMlle SalutationMme SalutationMr
## 29.36970 29.36970 13.70258
## SalutationMrs SalutationMs SalutationRev
## 17.66501 31.48623 -0.94928
## SalutationSir Salutationthe Countess PclassMiddle
## 31.00108 30.33017 1.37437
## PclassUpper Age Parch
## 2.73698 -0.11237 -0.02369
## SibSp Embarked2 Embarked3
## -0.03524 -0.23957 -0.58078
##
## Degrees of Freedom: 890 Total (i.e. Null); 867 Residual
## Null Deviance: 1187
## Residual Deviance: 729.9 AIC: 777.9
Test Goodness of Fit.
Deviance analysis, the likelihood ratio test, pseudo R-squared measures and the Hosmer-Lemeshow test were performed to assess the goodness of fit of the regression.
A logistic regression is said to provide a better fit to the data if it demonstrates an improvement over a model with fewer predictors. This is performed using the likelihood ratio test, which compares the likelihood of the data under the full model (Model #1) against the likelihood of the data under a model with fewer predictors (Model #3). Removing predictor variables from a model will almost always make the model fit less well, but it is necessary to test whether the observed difference in model fit is statistically significant.
The following hypotheses were stated:
The Null Hypothesis: the reduced model is true. The Alternative Hypothesis: the reduced model is false.
#final model
final.model <- glm(formula = Survived ~ Salutation + Pclass + SibSp + Age +
Parch + Embarked + Room, family = binomial(link = "logit"), data = data)
summary(final.model)
##
## Call:
## glm(formula = Survived ~ Salutation + Pclass + SibSp + Age +
## Parch + Embarked + Room, family = binomial(link = "logit"),
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4786 -0.5452 -0.3381 0.5042 2.5561
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.481e+01 2.400e+03 -0.006 0.9951
## SalutationCol 1.491e+01 2.400e+03 0.006 0.9950
## SalutationDon -2.688e+00 3.393e+03 -0.001 0.9994
## SalutationDr 1.479e+01 2.400e+03 0.006 0.9951
## SalutationJonkheer -2.301e+00 3.393e+03 -0.001 0.9995
## SalutationLady 3.142e+01 3.393e+03 0.009 0.9926
## SalutationMajor 1.479e+01 2.400e+03 0.006 0.9951
## SalutationMaster 1.723e+01 2.400e+03 0.007 0.9943
## SalutationMiss 1.686e+01 2.400e+03 0.007 0.9944
## SalutationMlle 2.986e+01 2.939e+03 0.010 0.9919
## SalutationMme 2.981e+01 3.393e+03 0.009 0.9930
## SalutationMr 1.407e+01 2.400e+03 0.006 0.9953
## SalutationMrs 1.797e+01 2.400e+03 0.007 0.9940
## SalutationMs 3.186e+01 3.393e+03 0.009 0.9925
## SalutationRev -5.747e-01 2.583e+03 0.000 0.9998
## SalutationSir 3.147e+01 3.393e+03 0.009 0.9926
## Salutationthe Countess 3.069e+01 3.393e+03 0.009 0.9928
## PclassMiddle 1.175e+00 2.569e-01 4.575 4.75e-06 ***
## PclassUpper 2.674e+00 2.922e-01 9.152 < 2e-16 ***
## SibSp -6.056e-01 1.277e-01 -4.743 2.10e-06 ***
## Age -4.333e-02 9.268e-03 -4.675 2.94e-06 ***
## Parch -3.340e-01 1.352e-01 -2.469 0.0135 *
## Embarked2 -9.415e-02 4.116e-01 -0.229 0.8191
## Embarked3 -4.181e-01 2.545e-01 -1.643 0.1004
## Room -1.893e-03 2.970e-03 -0.638 0.5238
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.66 on 890 degrees of freedom
## Residual deviance: 704.69 on 866 degrees of freedom
## AIC: 754.69
##
## Number of Fisher Scoring iterations: 15
#reduced model with fewer parameters
model2 <- glm(formula = Survived ~ Salutation + Pclass + SibSp + Parch + Fare,
family = binomial(link = "logit"), data = data)
model3 <- glm(formula = Survived ~ Salutation + Pclass + SibSp,
family = binomial(link = "logit"), data = data)
#Likelihood Ratio Test
anova(final.model, model2, test ="Chisq")
## Analysis of Deviance Table
##
## Model 1: Survived ~ Salutation + Pclass + SibSp + Age + Parch + Embarked +
## Room
## Model 2: Survived ~ Salutation + Pclass + SibSp + Parch + Fare
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 866 704.69
## 2 869 731.06 -3 -26.372 7.971e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(final.model, model3, test ="Chisq")
## Analysis of Deviance Table
##
## Model 1: Survived ~ Salutation + Pclass + SibSp + Age + Parch + Embarked +
## Room
## Model 2: Survived ~ Salutation + Pclass + SibSp
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 866 704.69
## 2 871 739.90 -5 -35.211 1.366e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
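For reference, a sketch of the statistic that anova(..., test = "Chisq") computes: the difference in residual deviances, compared against a chi-squared distribution with degrees of freedom equal to the difference in the number of parameters.
#likelihood ratio statistic for final.model vs model3, computed by hand
lrt_stat <- model3$deviance - final.model$deviance #35.211
lrt_df <- model3$df.residual - final.model$df.residual #5
1 - pchisq(lrt_stat, lrt_df) #matches the p-value reported above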
The results of the likelihood ratio tests show that the null hypothesis is rejected (the p-value is less than the significance level of 5%), providing evidence against the reduced models in favor of the final model.
The residual deviance, which shows how well the response is predicted by the model when the predictors are included, was used to test whether the logistic regression model provides an adequate fit for the data.
The following hypotheses were stated:
The Null Hypothesis: the logistic regression model provides an adequate fit for the data. The Alternative Hypothesis: the logistic regression model doesn't provide an adequate fit for the data.
#residual deviance test
p_value = 1 - pchisq(final.model$deviance,final.model$df.residual)
p_value
## [1] 0.9999819
As the p-value (0.99998) is far greater than the significance level of 5%, the null hypothesis is not rejected. This suggests that the fitted values are not significantly different from the observed values.
Unlike linear regression with ordinary least squares estimation, there is no R-squared statistic to explain the proportion of variance in the dependent variable that is explained by the predictors. However, there are a number of pseudo R-squared metrics that can be of value. Most notable is McFadden's R-squared, which ranges from 0 to just under 1, with values closer to zero indicating that the model has no predictive power.
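McFadden's R-squared is defined as 1 - ln(L_model)/ln(L_null); since the deviance of a binomial glm is -2 times the log-likelihood, it can be sketched directly from the fitted model:
#McFadden's pseudo R-squared computed from the deviances
1 - final.model$deviance / final.model$null.deviance #matches the McFadden value from pR2() below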
#Pseudo R^2 Test
pR2(final.model)
## llh llhNull G2 McFadden r2ML
## -352.3442170 -593.3275684 481.9667028 0.4061557 0.4177922
## r2CU
## 0.5676488
The results show that the model has fairly strong predictive power: McFadden's R-squared equals 0.406, and the Cragg-Uhler (Nagelkerke) r2CU equals 0.568.
Another approach to determining the goodness of fit is the Hosmer-Lemeshow statistic, which is computed after the observations have been segmented into groups with similar predicted probabilities. It examines whether the observed proportions of events are similar to the predicted probabilities of occurrence in subgroups of the data set, using a Pearson chi-squared test.
The following hypotheses were stated:
The Null Hypothesis: there is a good fit of the model. The Alternative Hypothesis: there is a poor fit of the model.
#Hosmer-Lemeshow Test
hoslem.test(data$Survived, fitted(final.model), g=10)
## Warning in Ops.factor(1, y): '-' not meaningful for factors
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: data$Survived, fitted(final.model)
## X-squared = 891, df = 8, p-value < 2.2e-16
Since the p-value is less than the significance level of 5%, the null hypothesis is rejected, which would indicate a poor fit. Note, however, the warning above: 'Survived' was passed as a factor, so the reported statistic is not meaningful. The test should be re-run with a numeric 0/1 response, as sketched below.
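#corrected call: hoslem.test() expects a numeric 0/1 response, not a factor
hoslem.test(as.numeric(as.character(data$Survived)), fitted(final.model), g = 10)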
Create dummy variables.
#create dummy variables for the levels of 'Pclass' and 'Salutation'
data$Pclass_Middle <- ifelse(data$Pclass == "Middle",1,0)
data$Pclass_Upper <- ifelse(data$Pclass == "Upper",1,0)
data$Salutation_Col <- ifelse(data$Salutation == "Col",1,0)
data$Salutation_Don <- ifelse(data$Salutation == "Don",1,0)
data$Salutation_Dr <- ifelse(data$Salutation == "Dr",1,0)
data$Salutation_Jonkheer <- ifelse(data$Salutation == "Jonkheer",1,0)
data$Salutation_Lady <- ifelse(data$Salutation == "Lady",1,0)
data$Salutation_Major <- ifelse(data$Salutation == "Major",1,0)
data$Salutation_Master <- ifelse(data$Salutation == "Master",1,0)
data$Salutation_Miss <- ifelse(data$Salutation == "Miss",1,0)
data$Salutation_Mlle <- ifelse(data$Salutation == "Mlle",1,0)
data$Salutation_Mme <- ifelse(data$Salutation == "Mme",1,0)
data$Salutation_Mr <- ifelse(data$Salutation == "Mr",1,0)
data$Salutation_Mrs <- ifelse(data$Salutation == "Mrs",1,0)
data$Salutation_Ms <- ifelse(data$Salutation == "Ms",1,0)
data$Salutation_Rev <- ifelse(data$Salutation == "Rev",1,0)
data$Salutation_Sir <- ifelse(data$Salutation == "Sir",1,0)
data$Salutation_theCountess <- ifelse(data$Salutation == "the Countess",1,0)
Predict ‘Survived’ class for testing and training data sets.
#create a new variable 'probability'
data$probability <- c()
data_testing$probability <- c()
#calculate the logit function by hand (the coefficients are hard-coded from a reduced model using Pclass, SibSp, Age and Parch)
logit_p <-1.170e+00*data$Pclass_Middle+2.523e+00*data$Pclass_Upper-5.814e-01*data$SibSp-3.722e-02*data$Age-3.206e-01*data$Parch
#logit_p_testing <- 0.8597*data$Pclass_Middle + 1.5757*data$Pclass_Upper -0.6767*data$SibSp - 0.4269*data$Parch + 0.5131*data$Fare+ 2.0799*data$Age
#calculate probability
data$probability <- exp(1)^logit_p/(1+exp(1)^logit_p)
#data_testing$probability <- exp(1)^logit_p_testing/(1+exp(1)^logit_p_testing)
head(data)
## Survived Pclass Salutation Sex SibSp Parch Fare Age Embarked
## 1 0 Lower Mr male 1 0 7.2500 22 Southampton
## 2 1 Upper Mrs female 1 0 71.2833 38 Cherbourg
## 3 1 Lower Miss female 0 0 7.9250 26 Southampton
## 4 1 Upper Mrs female 1 0 53.1000 35 Southampton
## 5 0 Lower Mr male 0 0 8.0500 35 Southampton
## 6 0 Lower Mr male 0 0 8.4583 36 Queenstown
## Floor Room with_relatives Pclass_Middle Pclass_Upper Salutation_Col
## 1 3 15 Yes 0 0 0
## 2 1 85 Yes 0 1 0
## 3 3 35 No 0 0 0
## 4 1 123 Yes 0 1 0
## 5 3 124 No 0 0 0
## 6 3 101 No 0 0 0
## Salutation_Don Salutation_Dr Salutation_Jonkheer Salutation_Lady
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## Salutation_Major Salutation_Master Salutation_Miss Salutation_Mlle
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 1 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## Salutation_Mme Salutation_Mr Salutation_Mrs Salutation_Ms Salutation_Rev
## 1 0 1 0 0 0
## 2 0 0 1 0 0
## 3 0 0 0 0 0
## 4 0 0 1 0 0
## 5 0 1 0 0 0
## 6 0 1 0 0 0
## Salutation_Sir Salutation_theCountess probability
## 1 0 0 0.1977780
## 2 0 0 0.6288392
## 3 0 0 0.2753352
## 4 0 0 0.6545048
## 5 0 0 0.2137110
## 6 0 0 0.2075232
#create a new variable that specifies predicted class
#data_testing$target_pred <-c()
head(data_testing)
## PassengerId Pclass Name Sex
## 1 892 3 Kelly, Mr. James male
## 2 893 3 Wilkes, Mrs. James (Ellen Needs) female
## 3 894 2 Myles, Mr. Thomas Francis male
## 4 895 3 Wirz, Mr. Albert male
## 5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female
## 6 897 3 Svensson, Mr. Johan Cervin male
## Age SibSp Parch Ticket Fare Cabin Embarked
## 1 34.5 0 0 330911 7.8292 <NA> Q
## 2 47.0 1 0 363272 7.0000 <NA> S
## 3 62.0 0 0 240276 9.6875 <NA> Q
## 4 27.0 0 0 315154 8.6625 <NA> S
## 5 22.0 1 1 3101298 12.2875 <NA> S
## 6 14.0 0 0 7538 9.2250 <NA> S
#calculate probability
data = within(data, {
Survived_pred = ifelse(data$probability < 0.5, 0, 1)
})
#data_testing = within(data_testing, {
# target_pred = ifelse(data_testing$probability < 0.5, 0, 1)
# })
#head(data_testing)
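Hand-typing the coefficients is error-prone (and, as written above, it drops the Salutation and Embarked terms from the final model). A sketch of the equivalent approach using predict() on the fitted model object:
#predicted probabilities directly from the fitted glm
prob_hat <- predict(final.model, newdata = data, type = "response")
#classify with a 0.5 cutoff
head(ifelse(prob_hat < 0.5, 0, 1))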
Calculate Classification Metrics.
The table below displays the values of the classification metrics, computed directly from the confusion matrix.
#create confusion matrix
confusion_matrix <- table("Predicted" = data$Survived_pred, "Actual" = data$Survived)
confusion_matrix
## Actual
## Predicted 0 1
## 0 443 191
## 1 106 151
#calculate true positive (predicted 1, actual 1; R stores tables column-major)
TP <- confusion_matrix[4]
#calculate true negative (predicted 0, actual 0)
TN <- confusion_matrix[1]
#calculate false negative (predicted 0, actual 1)
FN <- confusion_matrix[3]
#calculate false positive (predicted 1, actual 0)
FP <- confusion_matrix[2]
#calculate accuracy
accuracy <- (confusion_matrix[1,1] + confusion_matrix[2,2])/nrow(data)
accuracy
## [1] 0.6666667
#calculate accuracy classification error rate
classification_error_rate = (FP + FN)/(TP + FP + TN + FN)
classification_error_rate
## [1] 0.3333333
#calculate precision
precision = TP/(TP + FP)
precision
## [1] 0.5875486
#calculate sensitivity
sensitivity = TP/(TP + FN)
sensitivity
## [1] 0.4415205
#calculate specificity
specificity <- TN/(TN + FP)
specificity
## [1] 0.8069217
#calculate F1 score
F1_score <- (2*precision*sensitivity)/(precision + sensitivity)
F1_score
## [1] 0.5041736
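As a cross-check, the caret package (already loaded) computes all of these metrics in a single call; a sketch, assuming "1" (survived) is treated as the positive class:
#verify the hand-computed metrics with caret
caret::confusionMatrix(factor(data$Survived_pred, levels = c("0", "1")),
data$Survived, positive = "1")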
roc.val <- roc(Survived~probability, data)
plot(roc.val, main="ROC plot")
roc.val$auc
## Area under the curve: 0.709
Run different classifiers.
Note that evaluate_Weka_classifier() is applied here to the same data the classifiers were trained on, so the reported accuracies are optimistic; the perfect score for Knn below, in particular, reflects IBk memorizing the training instances rather than genuine predictive power.
#create vectors to store accuracies values and errors values
classifier_name <-c()
Accuracy_mean <- c()
Accuracy_upper <- c()
Accuracy_lower <- c()
Kappa <- c()
MAE <- c()
RMSE <- c()
RAE <- c()
RRSE <- c()
classifier_name[1] <-c("J48")
#create empty vectors to store accuracies values and errors values for a certain classifier
pctCorrect_vector <- c()
Kappa_vector <- c()
MAE_vector <- c()
RMSE_vector <- c()
RAE_vector <- c()
RRSE_vector <- c()
for (i in 1:50){
#set the seed of R‘s random number generator
set.seed(i)
resultJ48 <- J48(Survived ~., data)
evaluation <- evaluate_Weka_classifier(resultJ48)$details
pctCorrect_value <- evaluation["pctCorrect"]
Kappa_value <- evaluation["kappa"]
MAE_value <- evaluation["meanAbsoluteError"]
RMSE_value <- evaluation["rootMeanSquaredError"]
RAE_value <- evaluation["relativeAbsoluteError"]
RRSE_value <- evaluation["rootRelativeSquaredError"]
#add the value generated after each iteration to the vector
pctCorrect_vector <- c(pctCorrect_vector,pctCorrect_value)
Kappa_vector <- c(Kappa_vector, Kappa_value)
MAE_vector <- c(MAE_vector, MAE_value)
RMSE_vector <- c(RMSE_vector, RMSE_value)
RAE_vector <- c(RAE_vector, RAE_value)
RRSE_vector <- c(RRSE_vector, RRSE_value)
}
#calculate mean and standard deviation of accuracies and all errors
Accuracy_mean[1] <- mean(pctCorrect_vector)
pctCorrect_sd <- sd(pctCorrect_vector)
Accuracy_upper[1] <-Accuracy_mean[1] + pctCorrect_sd
Accuracy_lower[1] <-Accuracy_mean[1] - pctCorrect_sd
Kappa[1] <- mean(Kappa_vector)
MAE[1] <- mean(MAE_vector)
RMSE[1] <- mean(RMSE_vector)
RAE[1] <- mean(RAE_vector)
RRSE[1] <- mean(RRSE_vector)
classifier_name[2] <-c("Naive Bayes")
#create empty vectors to store accuracies values and errors values
pctCorrect_vector <- c()
Kappa_vector <- c()
MAE_vector <- c()
RMSE_vector <- c()
RAE_vector <- c()
RRSE_vector <- c()
for (i in 1:50){
set.seed(i)
#build the Naive Bayes classifier on the full data set
NB <- make_Weka_classifier("weka/classifiers/bayes/NaiveBayes")
result_NaiveBayes <- NB(Survived ~., data)
evaluation <- evaluate_Weka_classifier(result_NaiveBayes)$details
pctCorrect_value <- evaluation["pctCorrect"]
Kappa_value <- evaluation["kappa"]
MAE_value <- evaluation["meanAbsoluteError"]
RMSE_value <- evaluation["rootMeanSquaredError"]
RAE_value <- evaluation["relativeAbsoluteError"]
RRSE_value <- evaluation["rootRelativeSquaredError"]
#add the value generated after each iteration to the vector
pctCorrect_vector <- c(pctCorrect_vector,pctCorrect_value)
Kappa_vector <- c(Kappa_vector, Kappa_value)
MAE_vector <- c(MAE_vector, MAE_value)
RMSE_vector <- c(RMSE_vector, RMSE_value)
RAE_vector <- c(RAE_vector, RAE_value)
RRSE_vector <- c(RRSE_vector, RRSE_value)
}
#calculate mean and standard deviation of accuracies and all errors
Accuracy_mean[2] <- mean(pctCorrect_vector)
pctCorrect_sd[2] <- sd(pctCorrect_vector)
Accuracy_upper[2] <-Accuracy_mean[2] + pctCorrect_sd[2]
Accuracy_lower[2] <-Accuracy_mean[2] - pctCorrect_sd[2]
Kappa[2] <- mean(Kappa_vector)
MAE[2] <- mean(MAE_vector)
RMSE[2] <- mean(RMSE_vector)
RAE[2] <- mean(RAE_vector)
RRSE[2] <- mean(RRSE_vector)
classifier_name[3] <-c("Knn")
#skip the loop for this classifier since it runs very slowly
#for (i in 1:50){
#set.seed(i)
knn <- IBk(Survived ~., data)
evaluation <- evaluate_Weka_classifier(knn)$details
pctCorrect_value <- evaluation["pctCorrect"]
Kappa_value <- evaluation["kappa"]
MAE_value <- evaluation["meanAbsoluteError"]
RMSE_value <- evaluation["rootMeanSquaredError"]
RAE_value <- evaluation["relativeAbsoluteError"]
RRSE_value <- evaluation["rootRelativeSquaredError"]
#}
#calculate mean and standard deviation of accuracies and all errors
Accuracy_mean[3] <- mean(pctCorrect_value)
pctCorrect_sd[3] <- sd(pctCorrect_value)
Accuracy_upper[3] <- ""
Accuracy_lower[3] <- ""
Kappa[3] <- mean(Kappa_value)
MAE[3] <- mean(MAE_value)
RMSE[3] <- mean(RMSE_value)
RAE[3] <- mean(RAE_value)
RRSE[3] <- mean(RRSE_value)
#confusion Matrix
evaluate_Weka_classifier(knn)$confusionMatrix
## predicted
## 0 1
## 0 549 0
## 1 0 342
#ROC
plot(roc(data$Survived, predict(knn, data, type = "prob")[,2]))
classifier_name[4] <-c("SVM")
#skip the loop for this classifier since it runs very slowly
#for (i in 1:50){
#set.seed(i)
#build the SVM classifier on the full data set
SVM <- SMO(Survived ~., data)
evaluation <- evaluate_Weka_classifier(SVM)$details
pctCorrect_value <- evaluation["pctCorrect"]
Kappa_value <- evaluation["kappa"]
MAE_value <- evaluation["meanAbsoluteError"]
RMSE_value <- evaluation["rootMeanSquaredError"]
RAE_value <- evaluation["relativeAbsoluteError"]
RRSE_value <- evaluation["rootRelativeSquaredError"]
#}
#calculate mean and standard deviation of accuracies and all errors
Accuracy_mean[4] <- mean(pctCorrect_value)
#pctCorrect_sd[4] <- sd(pctCorrect_value)
Accuracy_upper[4] <- ""
Accuracy_lower[4] <- ""
Kappa[4] <- mean(Kappa_value)
MAE[4] <- mean(MAE_value)
RMSE[4] <- mean(RMSE_value)
RAE[4] <- mean(RAE_value)
RRSE[4] <- mean(RRSE_value)
data_table <- data.frame(classifier_name, Accuracy_mean, Accuracy_upper, Accuracy_lower, Kappa, MAE, RMSE, RAE, RRSE)
data_table
## classifier_name Accuracy_mean Accuracy_upper Accuracy_lower
## 1 J48 88.21549 88.2154882154882 88.2154882154882
## 2 Naive Bayes 80.02245 80.0224466891134 80.0224466891134
## 3 Knn 100.00000
## 4 SVM 83.27722
## Kappa MAE RMSE RAE RRSE
## 1 0.7445255 0.195435065 0.312598036 0.312598036 64.2783414
## 2 0.5863186 0.205613384 0.395207490 0.395207490 81.2650081
## 3 NA 0.001098478 0.001103904 0.001103904 0.2269916
## 4 0.6411211 0.167227834 0.408935000 0.408935000 84.0877436
#rename accuracy_mean column and divide by 100
#converting wide data format to long data
data_table <- data_table %>% dplyr::rename(Accuracy = Accuracy_mean) %>% dplyr::mutate(Accuracy = Accuracy/100) %>% dplyr::select(-Accuracy_upper,-Accuracy_lower,-Kappa,-RRSE)
data_table
## classifier_name Accuracy MAE RMSE RAE
## 1 J48 0.8821549 0.195435065 0.312598036 0.312598036
## 2 Naive Bayes 0.8002245 0.205613384 0.395207490 0.395207490
## 3 Knn 1.0000000 0.001098478 0.001103904 0.001103904
## 4 SVM 0.8327722 0.167227834 0.408935000 0.408935000
data1 <- data_table %>% tidyr::gather(accuracy_error,value,-classifier_name)
ggplot(data1, aes(factor(accuracy_error), value, fill = classifier_name)) + geom_bar(stat="identity", position = "dodge")
#+ scale_fill_brewer(palette = "Set1")