library(data.table)
library(knitr)
# I organize my project folders according to the website below. I can email you this project directory.
# https://martinctc.github.io/blog/rstudio-projects-and-working-directories-a-beginner's-guide/
#
# to run this, a specific "Project" folder must contain a "Data" folder
# the "Data" folder must contain a "raw" and "r" folder as its been provided in our class scripts
#
# .Rmd files are weird in projects. This file will exist in the "Project" folder, on the same level
# as the "Data" folder.
#
# this is all done according to the website above.
#
# email me at selfes@bc.edu and I can send the entire project folder.
data <- "Data"
dir.exists(data)
[1] TRUE
rawData <- file.path(data, "raw")
rData <- file.path(data, "r")
list.files(rawData)
[1] "Titanic_test.csv" "Titanic_train.csv"
ttr <- read.csv(file.path(rawData, "Titanic_train.csv"), stringsAsFactors = F)
object.size(ttr)
193176 bytes
# Save Titanic training dataset in RDS format
saveRDS(ttr, file.path(rData, "ttr.rds"))
# Save Titanic training dataset in fst format
library(fst)
write_fst(ttr, file.path(rData, "ttr.fst"))
ttr2 <- data.table(ttr)
colnames(ttr2)
[1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
[6] "Age" "SibSp" "Parch" "Ticket" "Fare"
[11] "Cabin" "Embarked"
library(Amelia)
library(Hmisc)
missmap(ttr2)
#sapply(ttr2, function(x) sum(is.na(x))) # what are we iterating over? columns of ttr2
colSums(is.na(ttr2))
PassengerId Survived Pclass Name Sex Age
0 0 0 0 0 177
SibSp Parch Ticket Fare Cabin Embarked
0 0 0 0 0 0
# Filtering by 'Age' for bar plots below
library(dplyr)
all_missing <- ttr2 %>% filter(is.na(ttr2$Age))
all_but_missing <- ttr2 %>% filter(! is.na(ttr2$Age))
ggplot(data=ttr2) +
geom_histogram(mapping=aes(x=Age),col="black")
# Replacing NA's with mean value
mean(ttr2$Age, na.rm=T)
[1] 29.69912
ttr2$Age <- as.numeric(impute(ttr2$Age, mean))
The only missing values are for the variable ‘Age.’ Given the above histogram, a reasonable way to handle the missing values will be to replace them with the mean.
The original dataset was split into all_missing, which contains only the observations with missing ‘Age’ values, and all_but_missing, which contains only the observations with no missing values. These two subsets are used to create the following two barplots below. The top is of all_missing and the bottom is all_but_missing. For both of these plots, the left plot (0) shows those who died and the right (1) shows those who lived.
library(ggplot2)
library(gridExtra)
sum(all_missing$Sex == "male")
[1] 124
sum(all_missing$Sex == "female")
[1] 53
ggplot(data=all_missing) +
geom_bar(mapping = aes(x=Pclass, fill=Sex), binwidth = 0.5, col="black") +
xlab("Class") + ggtitle("all_missing -- Class, Sex, Survived") +
scale_x_continuous(breaks=scales::pretty_breaks(n = 3)) +
scale_y_continuous(breaks=scales::pretty_breaks(n = 15)) +
facet_wrap(~ Survived) +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=12,face="bold"),
legend.position = c(0.6, 0.9))
ggplot(data=all_but_missing) +
geom_bar(mapping = aes(x=Pclass, fill=Sex), binwidth = 0.5, col="black") +
xlab("Class") + ggtitle("all_but_missing -- Class, Sex, Survived") +
scale_x_continuous(breaks=scales::pretty_breaks(n = 3)) +
scale_y_continuous(breaks=scales::pretty_breaks(n = 15)) +
facet_wrap(~ Survived) +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=12,face="bold"),
legend.position = c(0.6, 0.9))
# PassengerId - Qualitative, Ordinal
# Survived - - Qualitative, Nominal
# Pclass- - - Qualitative, Ordinal
# Name - - - Qualitative, Nominal
# Sex - - - Qualitative, Nominal
# Age - - - Quantitative, Ratio
# SibSp - - - Qualitative, Ordinal
# Parch - - - Quantitative, Ratio
# Ticket- - - Qualitative, Nominal
# Fare - - - Quantitative, Interval
# Cabin - - - Qualitative, Nominal
# Embarked - - Qualitative, Nominal
# Age
summary(ttr2$Age)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.42 22.00 29.70 29.70 35.00 80.00
ggplot(data=ttr2) +
geom_histogram(mapping=aes(x=Age), fill="cornflowerblue", col="black") +
ggtitle("AGE") +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=12,face="bold"))
# Parch
summary(ttr2$Parch)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.3816 0.0000 6.0000
ggplot(data=ttr2) +
geom_histogram(mapping=aes(x=Parch), fill="lightgreen", binwidth=1, col="black") +
ggtitle("PARCH") +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=12,face="bold"))
# Fare
summary(ttr2$Fare)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 7.91 14.45 32.20 31.00 512.33
ggplot(data=ttr2) +
geom_histogram(mapping=aes(x=Fare), fill="chartreuse1", col="black") +
ggtitle("FARE") +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=12,face="bold"))
### ALL CATEGORICAL VARIABLES ---------
#
#
# PassengerId
# Survived
# Pclass
# Name
# Sex
# SibSp
# Ticket
# Cabin
# Embarked
#
#
#
### -----------------------------------
# PassengerId
###############
#
#
# This command counts the amount of unique values present for any variable
length(unique(ttr2$PassengerId))
[1] 891
#
#
# This finds the intersection of any observation in either comparison -- effectively does
# the same thing as the above command.
intersection <- ttr2$PassengerId %in% ttr2$PassengerId
sum(intersection == FALSE) # there are no two observations that're the same for PassengerId
[1] 0
sum(intersection == TRUE) # this means that every single observation is unique
[1] 891
#
#
# This command shows the 5 highest observations, after sorting all in descending order.
head(order(-ttr2$PassengerId), 5)
[1] 891 890 889 888 887
#
#
# This command finds the associated observation value associated with the descending ordered
# rank found above. It is better seen below with the variable 'Survived'.
ttr2$PassengerId[head(order(-ttr2$PassengerId), 5)]
[1] 891 890 889 888 887
# Survived
##############
#
#
# These are the exact commands as above. The rest are automated below.
length(unique(ttr2$Survived))
[1] 2
head(order(-ttr2$Survived)) # these are the index locations of the sorted values which must be swapped.
[1] 2 3 4 9 10 11
ttr2$Survived[head(order(-ttr2$Survived), 5)]
[1] 1 1 1 1 1
#
#
#
# Pclass
# Name
# Sex
# SibSp
# Ticket
# Cabin
# Embarked
#
#
#
#########################
# The following for-loop iterates and describes all variables - not just the categorical ones.
# The same essential techniques shown above are used in this loop, with the exception of the
# if-statement, which determines if the most frequently occurring values are repeated.
#
#
# In the output, the 'five most frequently occurring values' are synced with the line below,
# which tells how many occurrences there are for each.
#########################
i = 1
for (i in 1:ncol(ttr2)){
vb.name <- colnames(ttr2[,..i])
unq.val <- nrow(unique(ttr2[,..i]))
####
top.five <- as.numeric(unlist(ttr2[,..i][head(order(-ttr2[,..i]),5)]))
occur <- as.numeric(unlist(head(sort(table(ttr2[,..i]), decreasing=T),5)))
if (sum(head(sort(table(ttr2[,..i]), decreasing=T),5)) == 5){ # sum == 5, each value unique
five.freq = "NA"
}else {
five.freq <- names(head(sort(table(ttr2[,..i]), decreasing=T),5))
}
####
out <- paste0("The variable '", vb.name, "' has ", unq.val, " unique values.")
out2 <- paste0("The five highest values are: ", top.five[1], ", ", top.five[2], ", ", top.five[3],
", ", top.five[4], ", ", top.five[5])
out3 <- paste0("The five most frequently occuring values are: ", five.freq[1], ", ", five.freq[2],
", ", five.freq[3],", ", five.freq[4], ", ", five.freq[5])
out4 <- paste0("The amount of times these values occur are : ", occur[1], ", ", occur[2],
", ", occur[3],", ", occur[4], ", ", occur[5])
print(out)
print(out3)
print(out4)
print(out2)
print(" ")
print(" ")
print(" ")
i = i + 1
}
[1] "The variable 'PassengerId' has 891 unique values."
[1] "The five most frequently occuring values are: NA, NA, NA, NA, NA"
[1] "The amount of times these values occur are : 1, 1, 1, 1, 1"
[1] "The five highest values are: 891, 890, 889, 888, 887"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Survived' has 2 unique values."
[1] "The five most frequently occuring values are: 0, 1, NA, NA, NA"
[1] "The amount of times these values occur are : 549, 342, NA, NA, NA"
[1] "The five highest values are: 1, 1, 1, 1, 1"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Pclass' has 3 unique values."
[1] "The five most frequently occuring values are: 3, 1, 2, NA, NA"
[1] "The amount of times these values occur are : 491, 216, 184, NA, NA"
[1] "The five highest values are: 3, 3, 3, 3, 3"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Name' has 891 unique values."
[1] "The five most frequently occuring values are: NA, NA, NA, NA, NA"
[1] "The amount of times these values occur are : 1, 1, 1, 1, 1"
[1] "The five highest values are: NA, NA, NA, NA, NA"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Sex' has 2 unique values."
[1] "The five most frequently occuring values are: male, female, NA, NA, NA"
[1] "The amount of times these values occur are : 577, 314, NA, NA, NA"
[1] "The five highest values are: NA, NA, NA, NA, NA"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Age' has 89 unique values."
[1] "The five most frequently occuring values are: 29.6991176470588, 24, 22, 18, 19"
[1] "The amount of times these values occur are : 177, 30, 27, 26, 25"
[1] "The five highest values are: 80, 74, 71, 71, 70.5"
[1] " "
[1] " "
[1] " "
[1] "The variable 'SibSp' has 7 unique values."
[1] "The five most frequently occuring values are: 0, 1, 2, 4, 3"
[1] "The amount of times these values occur are : 608, 209, 28, 18, 16"
[1] "The five highest values are: 8, 8, 8, 8, 8"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Parch' has 7 unique values."
[1] "The five most frequently occuring values are: 0, 1, 2, 3, 5"
[1] "The amount of times these values occur are : 678, 118, 80, 5, 5"
[1] "The five highest values are: 6, 5, 5, 5, 5"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Ticket' has 681 unique values."
[1] "The five most frequently occuring values are: 1601, 347082, CA. 2343, 3101295, 347088"
[1] "The amount of times these values occur are : 7, 7, 7, 6, 6"
[1] "The five highest values are: NA, NA, NA, NA, NA"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Fare' has 248 unique values."
[1] "The five most frequently occuring values are: 8.05, 13, 7.8958, 7.75, 26"
[1] "The amount of times these values occur are : 43, 42, 38, 34, 31"
[1] "The five highest values are: 512.3292, 512.3292, 512.3292, 263, 263"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Cabin' has 148 unique values."
[1] "The five most frequently occuring values are: , B96 B98, C23 C25 C27, G6, C22 C26"
[1] "The amount of times these values occur are : 687, 4, 4, 4, 3"
[1] "The five highest values are: NA, NA, NA, NA, NA"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Embarked' has 4 unique values."
[1] "The five most frequently occuring values are: S, C, Q, , NA"
[1] "The amount of times these values occur are : 644, 168, 77, 2, NA"
[1] "The five highest values are: NA, NA, NA, NA, NA"
[1] " "
[1] " "
[1] " "
# box-plots
ggplot(data = ttr2) +
geom_boxplot(mapping = aes(y=Age, fill=as.factor(Survived))) +
facet_wrap(~ Sex) +
ggtitle("Boxplot of Age and Survival -- Separated by Sex (0 = died, 1 = survived)")+
guides(fill=guide_legend(title = "Survived")) +
theme(legend.position = c(0.2, 0.9),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
title = element_text(size=15, face="bold")) +
scale_y_continuous(breaks=scales::pretty_breaks(n = 15))
b1 <- ggplot(data=ttr2) +
geom_boxplot(mapping = aes(y=Fare, fill=Sex)) +
facet_wrap(~ Pclass) +
ylim(0, quantile(ttr2$Fare[ttr2$Pclass==1], 0.9)) +
ggtitle("TOP- Fare and Sex, BOTTOM- Fare and Survival -- Separated by Class (1st class omits top 10% of data)") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
legend.position = c(0.84, 0.7),
title = element_text(size=11, face="bold"))
b2 <- ggplot(data=ttr2) +
geom_boxplot(mapping = aes(y = Fare, fill=as.factor(Survived))) +
facet_wrap(~ Pclass) +
ylim(0, quantile(ttr2$Fare[ttr2$Pclass==1], 0.9)) +
guides(fill=guide_legend(title = "Survived")) +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
legend.position = c(0.83, 0.7))
grid.arrange(b1, b2)
# histograms
h1 <- ggplot(data=ttr2)+
geom_histogram(aes(x = ttr2$Age, fill = ttr2$Survived==1), alpha=0.7) +
facet_wrap(~ Pclass, nrow=3)+
guides(fill=guide_legend(title = "Survived")) +
xlab("Age") +
ggtitle("Survival by Age and Class") +
scale_x_continuous(breaks=scales::pretty_breaks(n = 5)) +
scale_y_continuous(breaks=scales::pretty_breaks(n = 10)) +
theme(legend.position = c(0.15, 0.9),
axis.text=element_text(size=12),
axis.title=element_text(size=12,face="bold"),
title = element_text(size=18, face="bold"))
h2 <- ggplot(data=ttr2)+
geom_histogram(aes(x = ttr2$Age, fill = ttr2$Survived==1), alpha=0.7) +
facet_wrap(~ Sex)+
guides(fill=guide_legend(title = "Survived")) +
xlab("Age") +
ggtitle("Survival by Age and Sex")+
scale_x_continuous(breaks=scales::pretty_breaks(n = 5)) +
scale_y_continuous(breaks=scales::pretty_breaks(n = 10)) +
theme(legend.position = c(0.85, 0.9),
axis.text=element_text(size=12),
axis.title=element_text(size=12,face="bold"),
title = element_text(size=18, face="bold"))
grid.arrange(h1,h2, ncol=2)
Variable Consideration (Final selection determined further below)
I will keep ‘PassingerId’ handy, but will not use it in any model.
‘Survival’ will certainly be used in all models. It is the variable we are trying to predict.
‘Pclass’ will be considered. The above graphics suggest that class has an effect on survival.
‘Name’ will not be included. Using ‘PassingerId’ will be sufficient here.
‘Sex’ will be considered in my models. There appears to be significant effect on survival based on sex.
‘Age’ will be considered. It appears that younger passengers were prioritized.
‘SibSp’, the # of siblings/spouses aboard, will be considered.
‘Parch’, the # of parents/children aboard, will be considered.
‘Ticket’ will not be used. The type of ticket does not influence outcome in a way not otherwise covered.
‘Fare,’ surprisingly, will be considered. The above graphics suggest that those who spent the most on their ticket saw a higher likelyhood of surviving.
‘Cabin’ will not be used. Same as with ‘Ticket,’ it does not provide additional/useful information not already included.
‘Embarked’ will not be used. The port the passenger embarked from makes no difference.
data <- ttr2[,-c(4,9,11,12)]
#=============================================================================== Functions
numericising_sex <- function(sex){
if(sex == 'male'){
num = 0
}else if(sex == 'female'){
num = 1
}
return(num)
}
predict.regsubsets <- function(object, newdata, id, ...){
form <- as.formula(object$call[[2]])
mat <- model.matrix(form, newdata)
tcoef <- coef(object, id = id)
tvars <- names(tcoef)
mat[,tvars] %*% tcoef
}
# ==========================================================================#--
for (i in 1:nrow(data)){
num = as.numeric(numericising_sex(data[i,4]))
data[i,4] = num
}
data$Sex <- as.numeric(data$Sex)
Stepwise Selection
library(leaps)
regfit.full <- regsubsets(Survived~ ., data=data, nvmax=7)
summary(regfit.full)
Subset selection object
Call: regsubsets.formula(Survived ~ ., data = data, nvmax = 7)
7 Variables (and intercept)
Forced in Forced out
PassengerId FALSE FALSE
Pclass FALSE FALSE
Sex FALSE FALSE
Age FALSE FALSE
SibSp FALSE FALSE
Parch FALSE FALSE
Fare FALSE FALSE
1 subsets of each size up to 7
Selection Algorithm: exhaustive
PassengerId Pclass Sex Age SibSp Parch Fare
1 ( 1 ) " " " " "*" " " " " " " " "
2 ( 1 ) " " "*" "*" " " " " " " " "
3 ( 1 ) " " "*" "*" "*" " " " " " "
4 ( 1 ) " " "*" "*" "*" "*" " " " "
5 ( 1 ) " " "*" "*" "*" "*" " " "*"
6 ( 1 ) " " "*" "*" "*" "*" "*" "*"
7 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
regfit.fwd <- regsubsets(Survived ~ ., data = data, nvmax = 7, method = "forward")
summary(regfit.fwd)
Subset selection object
Call: regsubsets.formula(Survived ~ ., data = data, nvmax = 7, method = "forward")
7 Variables (and intercept)
Forced in Forced out
PassengerId FALSE FALSE
Pclass FALSE FALSE
Sex FALSE FALSE
Age FALSE FALSE
SibSp FALSE FALSE
Parch FALSE FALSE
Fare FALSE FALSE
1 subsets of each size up to 7
Selection Algorithm: forward
PassengerId Pclass Sex Age SibSp Parch Fare
1 ( 1 ) " " " " "*" " " " " " " " "
2 ( 1 ) " " "*" "*" " " " " " " " "
3 ( 1 ) " " "*" "*" "*" " " " " " "
4 ( 1 ) " " "*" "*" "*" "*" " " " "
5 ( 1 ) " " "*" "*" "*" "*" " " "*"
6 ( 1 ) " " "*" "*" "*" "*" "*" "*"
7 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
regfit.bwd <- regsubsets(Survived ~ ., data = data, nvmax = 7, method = "backward")
summary(regfit.bwd)
Subset selection object
Call: regsubsets.formula(Survived ~ ., data = data, nvmax = 7, method = "backward")
7 Variables (and intercept)
Forced in Forced out
PassengerId FALSE FALSE
Pclass FALSE FALSE
Sex FALSE FALSE
Age FALSE FALSE
SibSp FALSE FALSE
Parch FALSE FALSE
Fare FALSE FALSE
1 subsets of each size up to 7
Selection Algorithm: backward
PassengerId Pclass Sex Age SibSp Parch Fare
1 ( 1 ) " " " " "*" " " " " " " " "
2 ( 1 ) " " "*" "*" " " " " " " " "
3 ( 1 ) " " "*" "*" "*" " " " " " "
4 ( 1 ) " " "*" "*" "*" "*" " " " "
5 ( 1 ) " " "*" "*" "*" "*" " " "*"
6 ( 1 ) " " "*" "*" "*" "*" "*" "*"
7 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
Regarless of the direction in which Stepwise selection is performed, the order of the variables is not affected.
regfit.full <- regsubsets(Survived~., data=data)
reg.summary <- summary(regfit.full)
reg.summary$rsq
[1] 0.2952307 0.3676802 0.3833686 0.3932607 0.3941012 0.3949291 0.3949574
R2 adjusted plot
par(mfrow = c(1,2))
plot(reg.summary$rss, xlab = "# vars", ylab = "RSS", type = 'l')
plot(reg.summary$adjr2, xlab = "# vars", ylab = "R2 adj", type = 'o')
max.r2 <- which.max(reg.summary$adjr2)
points(max.r2, reg.summary$adjr2[max.r2], col = "red", cex = 2, pch = 20)
par(mfrow = c(1,1))
Mallows cp and BIC plots
par(mfrow = c(2,2))
plot(reg.summary$cp, xlab = "# vars", ylab = "Cp", type = 'l')
max.cp <- which.max(reg.summary$cp)
points(max.cp, reg.summary$cp[max.cp], col = "red", cex = 2, pch = 20)
min.cp <- which.min(reg.summary$cp)
points(min.cp, reg.summary$cp[min.cp], col = "red", cex = 2, pch = 20)
##
plot(reg.summary$bic, xlab = "# vars", ylab = "BIC", type = "l")
min.BIC <- which.min(reg.summary$bic)
points(min.BIC, reg.summary$bic[min.BIC], col = "red", cex = 2, pch = 20)
##
plot(regfit.full, scale = 'Cp')
plot(regfit.full, scale = 'bic')
coef(regfit.full, 7)
(Intercept) PassengerId Pclass Sex Age
7.823233e-01 1.010241e-05 -1.699086e-01 5.126161e-01 -5.873414e-03
SibSp Parch Fare
-4.322220e-02 -2.004511e-02 4.131417e-04
Splitting into training and testing datasets
set.seed(1)
sample(c(1:10), replace=T)
[1] 9 4 7 1 2 7 2 3 1 5
train <- sample(c(TRUE, FALSE), nrow(data), replace=TRUE)
table(train)
train
FALSE TRUE
446 445
test <- (!train)
regfit.best <- regsubsets(Survived~., data=data[test,])
test.mat <- model.matrix(Survived~., data=data[test,])
val.error <- rep(NA, 7) # might need to change this 19
for(i in c(1:7)){
tcoef <- coef(regfit.best, id = i)
pred <- test.mat[,names(tcoef)] %*% tcoef
val.error[i] <- mean((data$Survived[test]-pred)^2)
}
wmin <- which.min(val.error)
coef(regfit.best, wmin)
(Intercept) PassengerId Pclass Sex Age
8.275234e-01 1.944594e-05 -1.949507e-01 4.931301e-01 -5.383433e-03
SibSp Parch Fare
-5.214669e-02 -3.316540e-03 2.981997e-04
Cross-Validation
k <- 10
set.seed(123)
folds <- sample(1:k, nrow(data), replace=T)
table(folds)
folds
1 2 3 4 5 6 7 8 9 10
75 81 96 80 79 81 108 96 96 99
cv.errors <- matrix(NA, nrow = k, ncol = 7,
dimnames=list(NULL, paste(1:7)))
cv.errors
1 2 3 4 5 6 7
[1,] NA NA NA NA NA NA NA
[2,] NA NA NA NA NA NA NA
[3,] NA NA NA NA NA NA NA
[4,] NA NA NA NA NA NA NA
[5,] NA NA NA NA NA NA NA
[6,] NA NA NA NA NA NA NA
[7,] NA NA NA NA NA NA NA
[8,] NA NA NA NA NA NA NA
[9,] NA NA NA NA NA NA NA
[10,] NA NA NA NA NA NA NA
for(j in c(1:k)){
best.fit <- regsubsets(Survived ~ ., data=data[folds != j], nvmax=8)
for(i in c(1:7)){
pred <- predict.regsubsets(best.fit, data[folds == j,], id=i)
cv.errors[j,i] <- mean((data$Survived[folds == j] - pred)^2)
}
}
mean.cv.errors <- apply(cv.errors, MARGIN = 2, mean)
mean.cv.errors
1 2 3 4 5 6 7
0.1670771 0.1504528 0.1470566 0.1452590 0.1456742 0.1455704 0.1457004
par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')
tmin <- which.min(mean.cv.errors)
points(tmin, mean.cv.errors[tmin], col='red')
reg.best <- regsubsets(Survived ~., data = data, nvmax=8)
coef(reg.best, tmin)
(Intercept) Pclass Sex Age SibSp
0.825884655 -0.183741125 0.509350502 -0.005846433 -0.045356182
These four variables are the selected finals.
The chosen models are 1. K-Nearest Neighbor and 2. Multinomial Logistic Model (softmax). Since the four variables do not follow a normal distribution, LDA and QDA were ruled out. I don’t have any expectations. If I had to guess, I would say that softmax will out perform KNN.
# ================================================================================= Splitting Data
library(caret)
set.seed(12)
indexes <- createDataPartition(ttr2$Survived, p=0.90, list=F)
train <- ttr2[indexes,]
test <- ttr2[-indexes,]
# ================================================================================= Train data prep
train_x <- train[,-c(2,4,9,11,12)]
###
for (i in 1:nrow(train)){
num = as.numeric(numericising_sex(train_x[i,3]))
train_x[i,3] = num
}
train_x$Sex <- as.numeric(train_x$Sex)
###
train_x <- scale(train_x)[,]
train_y <- as.numeric(unlist(train[,2]))
# ================================================================================= Test data prep
test_x <- test[,-c(2,4,9,11,12)]
###
for (i in 1:nrow(test)){
num = as.numeric(numericising_sex(test_x[i,3]))
test_x[i,3] = num
}
test_x$Sex <- as.numeric(test_x$Sex)
###
test_x <- scale(test_x)[,]
test_y <- as.numeric(unlist(test[,2]))
# ================================================================================= Model Creation
knnmodel = knnreg(train_x, train_y)
str(knnmodel)
List of 3
$ learn :List of 2
..$ y: num [1:802] 0 1 1 1 0 0 0 0 1 1 ...
..$ X: num [1:802, 1:7] -1.73 -1.73 -1.72 -1.72 -1.72 ...
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : NULL
.. .. ..$ : chr [1:7] "PassengerId" "Pclass" "Sex" "Age" ...
$ k : num 5
$ theDots: list()
- attr(*, "class")= chr "knnreg"
pred_y <- predict(knnmodel, data.frame(test_x))
#print(data.frame(test_y, pred_y))
mse = mean((test_y - pred_y)^2)
mae = caret::MAE(test_y, pred_y)
rmse = caret::RMSE(test_y, pred_y)
cat("MSE:", mse, " MAE:", mae, " RMSE:", rmse)
MSE: 0.1451685 MAE: 0.2629213 RMSE: 0.3810099
# ================================================================================= Plot
x = 1:length(test_y)
compare <- data.frame(test_y, round(pred_y))
ggplot(data=compare) +
geom_line(mapping=aes(x=x, y=test_y), lwd = 2, color="red") +
geom_line(mapping=aes(x=x, y=pred_y), lwd = 1.5, color="blue", alpha=0.5) +
ggtitle("Red - Predictions, Blue - Actual")+
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "none")
Table Comparisons and AUC (KNN)
# ================================================================================= Evaluation
p2 <- predict(knnmodel, data.frame(train_x))
compare.train <- data.frame(train_y, round(p2))
tab2 <- table(compare.train)
tab2
round.p2.
train_y 0 1
0 438 50
1 67 247
sum(diag(tab2))/sum(tab2) # predictions on training data
[1] 0.8541147
tab <- table(compare)
tab
round.pred_y.
test_y 0 1
0 50 11
1 6 22
sum(diag(tab))/sum(tab) # predictions on test data
[1] 0.8089888
library(pROC)
auc(compare.train$train_y, compare.train$round.p2.) # Training Set
Area under the curve: 0.8421
auc(compare$test_y, compare$round.pred_y.) # Test Set
Area under the curve: 0.8027
This model over fits by about 4%.
# ================================================================================ Functions
final.convert <- function(val, threshold){
if(val >= logCost){
decided = 1
}else{
decided = 0
}
return(decided)
}
softmax <- function(par){
n.par <- length(par)
par1 <- sort(par, decreasing = TRUE)
Lk <- par1[1]
for (k in 1:(n.par-1)) {
Lk <- max(par1[k+1], Lk) + log1p(exp(-abs(par1[k+1] - Lk)))
}
val <- exp(par - Lk)
return(val)
}
# ================================================================================= Splitting Data
indexes <- createDataPartition(ttr2$Survived, p=0.90, list=F)
train <- data[indexes,]
test <- data[-indexes,]
attach(train)
X <- cbind(Pclass, Sex, Age, SibSp)
Y <- Survived
logit <- glm(Y~ X, family = binomial (link="logit"))
summary(logit)
Call:
glm(formula = Y ~ X, family = binomial(link = "logit"))
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6800 -0.5823 -0.4124 0.6056 2.4524
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.402315 0.450224 5.336 9.51e-08 ***
XPclass -1.187602 0.127179 -9.338 < 2e-16 ***
XSex 2.755420 0.205882 13.384 < 2e-16 ***
XAge -0.038220 0.008202 -4.660 3.17e-06 ***
XSibSp -0.330431 0.108109 -3.056 0.00224 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1067.33 on 801 degrees of freedom
Residual deviance: 706.03 on 797 degrees of freedom
AIC: 716.03
Number of Fisher Scoring iterations: 5
l1 <- predict(logit, type="response")
s.max <- softmax(l1)
compare <- data.frame(train$Survived, s.max)
eval <- data.frame(train$Survived, l1)
# ================================================================================= Fitting Test Set
attach(test)
X <- cbind(Pclass, Sex, Age, SibSp)
Y <- Survived
l2 <- predict(logit, test, type="response")
final.compare <- data.frame(test$Survived, l2)
# ================================================================================= Cost Function
cost = 0
for(i in 1:nrow(compare)){
current = compare[i,1] * compare[i,2]
cost = cost + current
}
logCost <- -(log(cost))
Table Comparisons and AUC (Softmax)
for(i in 1:nrow(final.compare)){
final = final.convert(final.compare[i,2], logCost)
final.compare[i,2] = final
}
for(i in 1:nrow(eval)){
final = final.convert(eval[i,2], logCost)
eval[i,2] = final
}
tab1 <- table(eval)
tab1
l1
train.Survived 0 1
0 485 10
1 161 146
sum(diag(tab1))/sum(tab1) # predictions on training data
[1] 0.786783
tab7 <- table(final.compare)
tab7
l2
test.Survived 0 1
0 54 0
1 17 18
sum(diag(tab7))/sum(tab7) # predictions on test data
[1] 0.8089888
auc(eval$train.Survived, eval$l1) # Training Set
Area under the curve: 0.7277
auc(final.compare$test.Survived, final.compare$l2) # Test Set
Area under the curve: 0.7571
This model could learn more with some tweaks.
The KNN model has a higher AUC than the Softmax model, with slightly more variation between its train and test sets. This difference occurs when overfitting is present so there is definitely room for improvement. The Softmax model likely has overfitting present based on its AUC scores too, but slightly less than the KNN model.
When comparing model predictions against each other, the Softmax accurately predicts a smaller proportion on its training set than it does on its test set. In contrast, the KNN model has a difference of almost 4% between its train and test set predictions with its train set being higher. This suggests that the model thinks it knows more than it actually does when handling unseen data. If both models increased their prediction performance on their training sets, the Softmax model would have more room for improvement.
Given that both models score almost identically when predicting un-seen data, I think I would put my money on the Softmax model. Given the potential room for improvement when comparing both models, I would put my money on the Softmax model at this time. There are certainly more tweaks that could be performed to find an optimal train/test split among other methods for improvement. The Softmax model is simply the more precise model so that is my choice.