library(data.table)
library(knitr)

Online Rubric (OR): 1

Load the titanic dataset.

# I organize my project folders according to the website below. I can email you this project directory. 
# https://martinctc.github.io/blog/rstudio-projects-and-working-directories-a-beginner's-guide/
#
# to run this, a specific "Project" folder must contain a "Data" folder
# the "Data" folder must contain a "raw" and "r" folder as its been provided in our class scripts
#
# .Rmd files are weird in projects. This file will exist in the "Project" folder, on the same level
# as the "Data" folder. 
#
# this is all done according to the website above.
#
# email me at selfes@bc.edu and I can send the entire project folder.

data <- "Data"
dir.exists(data)

[1] TRUE

rawData <- file.path(data, "raw")
rData <- file.path(data, "r")
list.files(rawData)

[1] "Titanic_test.csv"  "Titanic_train.csv"

ttr <- read.csv(file.path(rawData, "Titanic_train.csv"), stringsAsFactors = F)
object.size(ttr)

193176 bytes

# Save Titanic training dataset in RDS format
saveRDS(ttr, file.path(rData, "ttr.rds"))

# Save Titanic training dataset in fst format
library(fst)
write_fst(ttr, file.path(rData, "ttr.fst"))

ttr2 <- data.table(ttr)

OR 2

Investigate the missing values, and decide what to do with them

colnames(ttr2)

 [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
 [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
[11] "Cabin"       "Embarked"

library(Amelia)
library(Hmisc)
missmap(ttr2)

#sapply(ttr2, function(x) sum(is.na(x))) # what are we iterating over? columns of ttr2
colSums(is.na(ttr2))

PassengerId    Survived      Pclass        Name         Sex         Age 
          0           0           0           0           0         177 
      SibSp       Parch      Ticket        Fare       Cabin    Embarked 
          0           0           0           0           0           0

# Filtering by 'Age' for bar plots below
library(dplyr)
all_missing <- ttr2 %>% filter(is.na(ttr2$Age))
all_but_missing <- ttr2 %>% filter(! is.na(ttr2$Age))

ggplot(data=ttr2) + 
  geom_histogram(mapping=aes(x=Age),col="black")

# Replacing NA's with mean value
mean(ttr2$Age, na.rm=T)

[1] 29.69912

ttr2$Age <- as.numeric(impute(ttr2$Age, mean))

The only missing values are for the variable ‘Age.’ Given the above histogram, a reasonable way to handle the missing values will be to replace them with the mean.

The original dataset was split into all_missing, which contains only the observations with missing ‘Age’ values, and all_but_missing, which contains only the observations with no missing values. These two subsets are used to create the following two barplots below. The top is of all_missing and the bottom is all_but_missing. For both of these plots, the left plot (0) shows those who died and the right (1) shows those who lived.

library(ggplot2)
library(gridExtra)

sum(all_missing$Sex == "male")

[1] 124

sum(all_missing$Sex == "female")

[1] 53

ggplot(data=all_missing) + 
  geom_bar(mapping = aes(x=Pclass, fill=Sex), binwidth = 0.5, col="black") + 
  xlab("Class") + ggtitle("all_missing -- Class, Sex, Survived") + 
  scale_x_continuous(breaks=scales::pretty_breaks(n = 3)) +
  scale_y_continuous(breaks=scales::pretty_breaks(n = 15)) +
  facet_wrap(~ Survived) + 
  theme(axis.text=element_text(size=12),
        axis.title=element_text(size=12,face="bold"),
        legend.position = c(0.6, 0.9))

ggplot(data=all_but_missing) + 
  geom_bar(mapping = aes(x=Pclass, fill=Sex), binwidth = 0.5, col="black") + 
  xlab("Class") + ggtitle("all_but_missing -- Class, Sex, Survived") + 
  scale_x_continuous(breaks=scales::pretty_breaks(n = 3)) +
  scale_y_continuous(breaks=scales::pretty_breaks(n = 15)) +
  facet_wrap(~ Survived) + 
  theme(axis.text=element_text(size=12),
        axis.title=element_text(size=12,face="bold"),
        legend.position = c(0.6, 0.9))

OR 3

For each variable, indicate whether it is numeric or categorical; if the latter, is it ordinal or not.

# PassengerId   - Qualitative, Ordinal
# Survived  -   - Qualitative, Nominal
# Pclass-   -   - Qualitative, Ordinal
# Name  -   -   - Qualitative, Nominal
# Sex   -   -   - Qualitative, Nominal
# Age   -   -   - Quantitative, Ratio
# SibSp -   -   - Qualitative, Ordinal
# Parch -   -   - Quantitative, Ratio
# Ticket-   -   - Qualitative, Nominal
# Fare  -   -   - Quantitative, Interval 
# Cabin -   -   - Qualitative, Nominal
# Embarked  -   - Qualitative, Nominal

OR 4

If the variable is numeric, list the mean, standard deviation, minimum, maximum, plot a histogram.

# Age
summary(ttr2$Age)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.42   22.00   29.70   29.70   35.00   80.00

ggplot(data=ttr2) + 
  geom_histogram(mapping=aes(x=Age), fill="cornflowerblue", col="black") + 
  ggtitle("AGE") +
  theme(axis.text=element_text(size=12),
        axis.title=element_text(size=12,face="bold"))

# Parch
summary(ttr2$Parch)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.3816  0.0000  6.0000

ggplot(data=ttr2) + 
  geom_histogram(mapping=aes(x=Parch), fill="lightgreen", binwidth=1, col="black") + 
  ggtitle("PARCH") +
  theme(axis.text=element_text(size=12),
        axis.title=element_text(size=12,face="bold"))

# Fare
summary(ttr2$Fare)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    7.91   14.45   32.20   31.00  512.33

ggplot(data=ttr2) + 
  geom_histogram(mapping=aes(x=Fare), fill="chartreuse1", col="black") + 
  ggtitle("FARE") +
  theme(axis.text=element_text(size=12),
        axis.title=element_text(size=12,face="bold"))

OR 5

If the variable is categorical, identify the number of distinct levels (values), and list the top 5 in descending order of frequency.

### ALL CATEGORICAL VARIABLES ---------
#
#
# PassengerId
# Survived
# Pclass
# Name
# Sex
# SibSp
# Ticket
# Cabin
# Embarked  
#
#
#
### -----------------------------------




# PassengerId
###############
#
#
# This command counts the amount of unique values present for any variable
length(unique(ttr2$PassengerId))

[1] 891

#
#
# This finds the intersection of any observation in either comparison -- effectively does 
# the same thing as the above command. 
intersection <- ttr2$PassengerId %in% ttr2$PassengerId

sum(intersection == FALSE) # there are no two observations that're the same for PassengerId

[1] 0

sum(intersection == TRUE)  # this means that every single observation is unique

[1] 891

#
#
# This command shows the 5 highest observations, after sorting all in descending order. 
head(order(-ttr2$PassengerId), 5)

[1] 891 890 889 888 887

#
#
# This command finds the associated observation value associated with the descending ordered
# rank found above. It is better seen below with the variable 'Survived'. 
ttr2$PassengerId[head(order(-ttr2$PassengerId), 5)]

[1] 891 890 889 888 887

# Survived
##############
#
#
# These are the exact commands as above. The rest are automated below. 
length(unique(ttr2$Survived))

[1] 2

head(order(-ttr2$Survived)) # these are the index locations of the sorted values which must be swapped.

[1]  2  3  4  9 10 11

ttr2$Survived[head(order(-ttr2$Survived), 5)]

[1] 1 1 1 1 1

#
#
#
# Pclass
# Name
# Sex
# SibSp
# Ticket
# Cabin
# Embarked  
#
#
#




#########################
# The following for-loop iterates and describes all variables - not just the categorical ones. 
# The same essential techniques shown above are used in this loop, with the exception of the 
# if-statement, which determines if the most frequently occurring values are repeated. 
#
#
# In the output, the 'five most frequently occurring values' are synced with the line below, 
# which tells how many occurrences there are for each. 
#########################

i = 1
for (i in 1:ncol(ttr2)){
  vb.name <- colnames(ttr2[,..i])
  unq.val <- nrow(unique(ttr2[,..i]))
  
      ####
      top.five <- as.numeric(unlist(ttr2[,..i][head(order(-ttr2[,..i]),5)]))
      occur <- as.numeric(unlist(head(sort(table(ttr2[,..i]), decreasing=T),5)))
      if (sum(head(sort(table(ttr2[,..i]), decreasing=T),5)) == 5){   # sum == 5, each value unique
        five.freq = "NA"
      }else {
        five.freq <- names(head(sort(table(ttr2[,..i]), decreasing=T),5))
      }
      ####
      
  out <- paste0("The variable '", vb.name, "' has ", unq.val, " unique values.")
  out2 <- paste0("The five highest values are: ", top.five[1], ", ", top.five[2], ", ", top.five[3],
                 ", ", top.five[4], ", ", top.five[5])
  out3 <- paste0("The five most frequently occuring values are: ", five.freq[1], ", ", five.freq[2],
                 ", ", five.freq[3],", ", five.freq[4], ", ", five.freq[5])
  out4 <- paste0("The amount of times these values occur are  : ", occur[1], ", ", occur[2],
                 ", ", occur[3],", ", occur[4], ", ", occur[5])
  print(out)
  print(out3)
  print(out4)
  print(out2)
  
  print(" ")
  print(" ")
  print(" ")


  i = i + 1
}

[1] "The variable 'PassengerId' has 891 unique values."
[1] "The five most frequently occuring values are: NA, NA, NA, NA, NA"
[1] "The amount of times these values occur are  : 1, 1, 1, 1, 1"
[1] "The five highest values are: 891, 890, 889, 888, 887"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Survived' has 2 unique values."
[1] "The five most frequently occuring values are: 0, 1, NA, NA, NA"
[1] "The amount of times these values occur are  : 549, 342, NA, NA, NA"
[1] "The five highest values are: 1, 1, 1, 1, 1"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Pclass' has 3 unique values."
[1] "The five most frequently occuring values are: 3, 1, 2, NA, NA"
[1] "The amount of times these values occur are  : 491, 216, 184, NA, NA"
[1] "The five highest values are: 3, 3, 3, 3, 3"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Name' has 891 unique values."
[1] "The five most frequently occuring values are: NA, NA, NA, NA, NA"
[1] "The amount of times these values occur are  : 1, 1, 1, 1, 1"
[1] "The five highest values are: NA, NA, NA, NA, NA"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Sex' has 2 unique values."
[1] "The five most frequently occuring values are: male, female, NA, NA, NA"
[1] "The amount of times these values occur are  : 577, 314, NA, NA, NA"
[1] "The five highest values are: NA, NA, NA, NA, NA"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Age' has 89 unique values."
[1] "The five most frequently occuring values are: 29.6991176470588, 24, 22, 18, 19"
[1] "The amount of times these values occur are  : 177, 30, 27, 26, 25"
[1] "The five highest values are: 80, 74, 71, 71, 70.5"
[1] " "
[1] " "
[1] " "
[1] "The variable 'SibSp' has 7 unique values."
[1] "The five most frequently occuring values are: 0, 1, 2, 4, 3"
[1] "The amount of times these values occur are  : 608, 209, 28, 18, 16"
[1] "The five highest values are: 8, 8, 8, 8, 8"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Parch' has 7 unique values."
[1] "The five most frequently occuring values are: 0, 1, 2, 3, 5"
[1] "The amount of times these values occur are  : 678, 118, 80, 5, 5"
[1] "The five highest values are: 6, 5, 5, 5, 5"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Ticket' has 681 unique values."
[1] "The five most frequently occuring values are: 1601, 347082, CA. 2343, 3101295, 347088"
[1] "The amount of times these values occur are  : 7, 7, 7, 6, 6"
[1] "The five highest values are: NA, NA, NA, NA, NA"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Fare' has 248 unique values."
[1] "The five most frequently occuring values are: 8.05, 13, 7.8958, 7.75, 26"
[1] "The amount of times these values occur are  : 43, 42, 38, 34, 31"
[1] "The five highest values are: 512.3292, 512.3292, 512.3292, 263, 263"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Cabin' has 148 unique values."
[1] "The five most frequently occuring values are: , B96 B98, C23 C25 C27, G6, C22 C26"
[1] "The amount of times these values occur are  : 687, 4, 4, 4, 3"
[1] "The five highest values are: NA, NA, NA, NA, NA"
[1] " "
[1] " "
[1] " "
[1] "The variable 'Embarked' has 4 unique values."
[1] "The five most frequently occuring values are: S, C, Q, , NA"
[1] "The amount of times these values occur are  : 644, 168, 77, 2, NA"
[1] "The five highest values are: NA, NA, NA, NA, NA"
[1] " "
[1] " "
[1] " "

OR 6

For each variable, decide based on your inspection whether you will use it or not in your model(s)

Plots included here.

# box-plots


ggplot(data = ttr2) + 
  geom_boxplot(mapping = aes(y=Age, fill=as.factor(Survived))) + 
  facet_wrap(~ Sex) + 
  ggtitle("Boxplot of Age and Survival -- Separated by Sex (0 = died, 1 = survived)")+
  guides(fill=guide_legend(title = "Survived")) + 
  theme(legend.position = c(0.2, 0.9),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        title = element_text(size=15, face="bold")) + 
  scale_y_continuous(breaks=scales::pretty_breaks(n = 15))

b1 <- ggplot(data=ttr2) + 
        geom_boxplot(mapping = aes(y=Fare, fill=Sex)) + 
        facet_wrap(~ Pclass) + 
        ylim(0, quantile(ttr2$Fare[ttr2$Pclass==1], 0.9)) + 
        ggtitle("TOP- Fare and Sex,   BOTTOM- Fare and Survival -- Separated by Class (1st class omits top 10% of data)") +
        theme(axis.text.x = element_blank(),
              axis.ticks.x = element_blank(),
              legend.position = c(0.84, 0.7),
              title = element_text(size=11, face="bold"))

b2 <- ggplot(data=ttr2) + 
        geom_boxplot(mapping = aes(y = Fare, fill=as.factor(Survived))) + 
        facet_wrap(~ Pclass) + 
        ylim(0, quantile(ttr2$Fare[ttr2$Pclass==1], 0.9)) + 
        guides(fill=guide_legend(title = "Survived")) +
        theme(axis.text.x = element_blank(),
              axis.ticks.x = element_blank(),
              legend.position = c(0.83, 0.7))


grid.arrange(b1, b2)

# histograms
h1 <- ggplot(data=ttr2)+
  geom_histogram(aes(x = ttr2$Age, fill = ttr2$Survived==1), alpha=0.7) + 
  facet_wrap(~ Pclass, nrow=3)+
  guides(fill=guide_legend(title = "Survived")) + 
  xlab("Age") + 
  ggtitle("Survival by Age and Class") + 
  scale_x_continuous(breaks=scales::pretty_breaks(n = 5)) +
  scale_y_continuous(breaks=scales::pretty_breaks(n = 10)) +
  theme(legend.position = c(0.15, 0.9),
        axis.text=element_text(size=12),
        axis.title=element_text(size=12,face="bold"),
        title = element_text(size=18, face="bold"))

h2 <- ggplot(data=ttr2)+
  geom_histogram(aes(x = ttr2$Age, fill = ttr2$Survived==1), alpha=0.7) + 
  facet_wrap(~ Sex)+
  guides(fill=guide_legend(title = "Survived")) + 
  xlab("Age") + 
  ggtitle("Survival by Age and Sex")+ 
  scale_x_continuous(breaks=scales::pretty_breaks(n = 5)) +
  scale_y_continuous(breaks=scales::pretty_breaks(n = 10)) +
  theme(legend.position = c(0.85, 0.9),
        axis.text=element_text(size=12),
        axis.title=element_text(size=12,face="bold"),
        title = element_text(size=18, face="bold"))

grid.arrange(h1,h2, ncol=2)

Variable Consideration (Final selection determined further below)

I will keep ‘PassingerId’ handy, but will not use it in any model.
‘Survival’ will certainly be used in all models. It is the variable we are trying to predict.
‘Pclass’ will be considered. The above graphics suggest that class has an effect on survival.
‘Name’ will not be included. Using ‘PassingerId’ will be sufficient here.
‘Sex’ will be considered in my models. There appears to be significant effect on survival based on sex.
‘Age’ will be considered. It appears that younger passengers were prioritized.
‘SibSp’, the # of siblings/spouses aboard, will be considered.
‘Parch’, the # of parents/children aboard, will be considered.
‘Ticket’ will not be used. The type of ticket does not influence outcome in a way not otherwise covered.
‘Fare,’ surprisingly, will be considered. The above graphics suggest that those who spent the most on their ticket saw a higher likelyhood of surviving.
‘Cabin’ will not be used. Same as with ‘Ticket,’ it does not provide additional/useful information not already included.
‘Embarked’ will not be used. The port the passenger embarked from makes no difference.

Data Prep / Final Variable Selection

data <- ttr2[,-c(4,9,11,12)]

#=============================================================================== Functions
numericising_sex <- function(sex){
  if(sex == 'male'){
    num = 0
  }else if(sex == 'female'){
    num = 1
  }
  return(num)
}

predict.regsubsets <- function(object, newdata, id, ...){
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  tcoef <- coef(object, id = id)
  tvars <- names(tcoef)
  mat[,tvars] %*% tcoef
}
# ==========================================================================#--


for (i in 1:nrow(data)){
  num = as.numeric(numericising_sex(data[i,4]))
  data[i,4] = num
}
data$Sex <- as.numeric(data$Sex)

Stepwise Selection

library(leaps)

regfit.full <- regsubsets(Survived~ ., data=data, nvmax=7)
summary(regfit.full)

Subset selection object
Call: regsubsets.formula(Survived ~ ., data = data, nvmax = 7)
7 Variables  (and intercept)
            Forced in Forced out
PassengerId     FALSE      FALSE
Pclass          FALSE      FALSE
Sex             FALSE      FALSE
Age             FALSE      FALSE
SibSp           FALSE      FALSE
Parch           FALSE      FALSE
Fare            FALSE      FALSE
1 subsets of each size up to 7
Selection Algorithm: exhaustive
         PassengerId Pclass Sex Age SibSp Parch Fare
1  ( 1 ) " "         " "    "*" " " " "   " "   " " 
2  ( 1 ) " "         "*"    "*" " " " "   " "   " " 
3  ( 1 ) " "         "*"    "*" "*" " "   " "   " " 
4  ( 1 ) " "         "*"    "*" "*" "*"   " "   " " 
5  ( 1 ) " "         "*"    "*" "*" "*"   " "   "*" 
6  ( 1 ) " "         "*"    "*" "*" "*"   "*"   "*" 
7  ( 1 ) "*"         "*"    "*" "*" "*"   "*"   "*"

regfit.fwd <- regsubsets(Survived ~ ., data = data, nvmax = 7, method = "forward")
summary(regfit.fwd)

Subset selection object
Call: regsubsets.formula(Survived ~ ., data = data, nvmax = 7, method = "forward")
7 Variables  (and intercept)
            Forced in Forced out
PassengerId     FALSE      FALSE
Pclass          FALSE      FALSE
Sex             FALSE      FALSE
Age             FALSE      FALSE
SibSp           FALSE      FALSE
Parch           FALSE      FALSE
Fare            FALSE      FALSE
1 subsets of each size up to 7
Selection Algorithm: forward
         PassengerId Pclass Sex Age SibSp Parch Fare
1  ( 1 ) " "         " "    "*" " " " "   " "   " " 
2  ( 1 ) " "         "*"    "*" " " " "   " "   " " 
3  ( 1 ) " "         "*"    "*" "*" " "   " "   " " 
4  ( 1 ) " "         "*"    "*" "*" "*"   " "   " " 
5  ( 1 ) " "         "*"    "*" "*" "*"   " "   "*" 
6  ( 1 ) " "         "*"    "*" "*" "*"   "*"   "*" 
7  ( 1 ) "*"         "*"    "*" "*" "*"   "*"   "*"

regfit.bwd <- regsubsets(Survived ~ ., data = data, nvmax = 7, method = "backward")
summary(regfit.bwd)

Subset selection object
Call: regsubsets.formula(Survived ~ ., data = data, nvmax = 7, method = "backward")
7 Variables  (and intercept)
            Forced in Forced out
PassengerId     FALSE      FALSE
Pclass          FALSE      FALSE
Sex             FALSE      FALSE
Age             FALSE      FALSE
SibSp           FALSE      FALSE
Parch           FALSE      FALSE
Fare            FALSE      FALSE
1 subsets of each size up to 7
Selection Algorithm: backward
         PassengerId Pclass Sex Age SibSp Parch Fare
1  ( 1 ) " "         " "    "*" " " " "   " "   " " 
2  ( 1 ) " "         "*"    "*" " " " "   " "   " " 
3  ( 1 ) " "         "*"    "*" "*" " "   " "   " " 
4  ( 1 ) " "         "*"    "*" "*" "*"   " "   " " 
5  ( 1 ) " "         "*"    "*" "*" "*"   " "   "*" 
6  ( 1 ) " "         "*"    "*" "*" "*"   "*"   "*" 
7  ( 1 ) "*"         "*"    "*" "*" "*"   "*"   "*"

Regarless of the direction in which Stepwise selection is performed, the order of the variables is not affected.

regfit.full <- regsubsets(Survived~., data=data)
reg.summary <- summary(regfit.full)

reg.summary$rsq

[1] 0.2952307 0.3676802 0.3833686 0.3932607 0.3941012 0.3949291 0.3949574

R2 adjusted plot

par(mfrow = c(1,2))
plot(reg.summary$rss, xlab = "# vars", ylab = "RSS", type = 'l')
plot(reg.summary$adjr2, xlab = "# vars", ylab = "R2 adj", type = 'o')

max.r2 <- which.max(reg.summary$adjr2)
points(max.r2, reg.summary$adjr2[max.r2], col = "red", cex = 2, pch = 20)

par(mfrow = c(1,1))

Mallows cp and BIC plots

par(mfrow = c(2,2))
plot(reg.summary$cp, xlab = "# vars", ylab = "Cp", type = 'l')
max.cp <- which.max(reg.summary$cp)
points(max.cp, reg.summary$cp[max.cp], col = "red", cex = 2, pch = 20)

min.cp <- which.min(reg.summary$cp)
points(min.cp, reg.summary$cp[min.cp], col = "red", cex = 2, pch = 20)

##

plot(reg.summary$bic, xlab = "# vars", ylab = "BIC", type = "l")
min.BIC <- which.min(reg.summary$bic)
points(min.BIC, reg.summary$bic[min.BIC], col = "red", cex = 2, pch = 20)

##

plot(regfit.full, scale = 'Cp')
plot(regfit.full, scale = 'bic')

coef(regfit.full, 7)

  (Intercept)   PassengerId        Pclass           Sex           Age 
 7.823233e-01  1.010241e-05 -1.699086e-01  5.126161e-01 -5.873414e-03 
        SibSp         Parch          Fare 
-4.322220e-02 -2.004511e-02  4.131417e-04

Splitting into training and testing datasets

set.seed(1)
sample(c(1:10), replace=T)

 [1] 9 4 7 1 2 7 2 3 1 5

train <- sample(c(TRUE, FALSE), nrow(data), replace=TRUE)
table(train)

train
FALSE  TRUE 
  446   445

test <- (!train)
regfit.best <- regsubsets(Survived~., data=data[test,])

test.mat <- model.matrix(Survived~., data=data[test,])
val.error <- rep(NA, 7) # might need to change this 19
for(i in c(1:7)){
  tcoef <- coef(regfit.best, id = i)
  pred <- test.mat[,names(tcoef)] %*% tcoef
  val.error[i] <- mean((data$Survived[test]-pred)^2)
}

wmin <- which.min(val.error)
coef(regfit.best, wmin)

  (Intercept)   PassengerId        Pclass           Sex           Age 
 8.275234e-01  1.944594e-05 -1.949507e-01  4.931301e-01 -5.383433e-03 
        SibSp         Parch          Fare 
-5.214669e-02 -3.316540e-03  2.981997e-04

Cross-Validation

k <- 10 
set.seed(123)
folds <- sample(1:k, nrow(data), replace=T)
table(folds)

folds
  1   2   3   4   5   6   7   8   9  10 
 75  81  96  80  79  81 108  96  96  99

cv.errors <- matrix(NA, nrow = k, ncol = 7,
  dimnames=list(NULL, paste(1:7)))
cv.errors

       1  2  3  4  5  6  7
 [1,] NA NA NA NA NA NA NA
 [2,] NA NA NA NA NA NA NA
 [3,] NA NA NA NA NA NA NA
 [4,] NA NA NA NA NA NA NA
 [5,] NA NA NA NA NA NA NA
 [6,] NA NA NA NA NA NA NA
 [7,] NA NA NA NA NA NA NA
 [8,] NA NA NA NA NA NA NA
 [9,] NA NA NA NA NA NA NA
[10,] NA NA NA NA NA NA NA

for(j in c(1:k)){
  best.fit <- regsubsets(Survived ~ ., data=data[folds != j], nvmax=8)
  for(i in c(1:7)){
    pred <- predict.regsubsets(best.fit, data[folds == j,], id=i)
    cv.errors[j,i] <- mean((data$Survived[folds == j] - pred)^2)
  }
}

mean.cv.errors <- apply(cv.errors, MARGIN = 2, mean)
mean.cv.errors

        1         2         3         4         5         6         7 
0.1670771 0.1504528 0.1470566 0.1452590 0.1456742 0.1455704 0.1457004

par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')

tmin <- which.min(mean.cv.errors)
points(tmin, mean.cv.errors[tmin], col='red')

reg.best <- regsubsets(Survived ~., data = data, nvmax=8)
coef(reg.best, tmin)

 (Intercept)       Pclass          Sex          Age        SibSp 
 0.825884655 -0.183741125  0.509350502 -0.005846433 -0.045356182

These four variables are the selected finals.

OR 7

Select 2 of the models studied so far: … ; before you use them, note your expectation (if any) on performance (which one will “learn” better this data)

The chosen models are 1. K-Nearest Neighbor and 2. Multinomial Logistic Model (softmax). Since the four variables do not follow a normal distribution, LDA and QDA were ruled out. I don’t have any expectations. If I had to guess, I would say that softmax will out perform KNN.

OR 8

Train model #1 on the dataset; produce the probability predicitons for survival, then compute the AUC (area under the curve)

K-Nearest Neighbor

# ================================================================================= Splitting Data
library(caret)
set.seed(12)

indexes <- createDataPartition(ttr2$Survived, p=0.90, list=F)
train <- ttr2[indexes,]
test <- ttr2[-indexes,]

# ================================================================================= Train data prep
train_x <- train[,-c(2,4,9,11,12)]

###
for (i in 1:nrow(train)){
  num = as.numeric(numericising_sex(train_x[i,3]))
  train_x[i,3] = num
}
train_x$Sex <- as.numeric(train_x$Sex)
###
train_x <- scale(train_x)[,]
train_y <- as.numeric(unlist(train[,2]))

# ================================================================================= Test data prep
test_x <- test[,-c(2,4,9,11,12)]
###
for (i in 1:nrow(test)){
  num = as.numeric(numericising_sex(test_x[i,3]))
  test_x[i,3] = num
}
test_x$Sex <- as.numeric(test_x$Sex)
###
test_x <- scale(test_x)[,]
test_y <- as.numeric(unlist(test[,2]))

# ================================================================================= Model Creation
knnmodel = knnreg(train_x, train_y)
str(knnmodel)

List of 3
 $ learn  :List of 2
  ..$ y: num [1:802] 0 1 1 1 0 0 0 0 1 1 ...
  ..$ X: num [1:802, 1:7] -1.73 -1.73 -1.72 -1.72 -1.72 ...
  .. ..- attr(*, "dimnames")=List of 2
  .. .. ..$ : NULL
  .. .. ..$ : chr [1:7] "PassengerId" "Pclass" "Sex" "Age" ...
 $ k      : num 5
 $ theDots: list()
 - attr(*, "class")= chr "knnreg"

pred_y <- predict(knnmodel, data.frame(test_x))

#print(data.frame(test_y, pred_y))
mse = mean((test_y - pred_y)^2)
mae = caret::MAE(test_y, pred_y)
rmse = caret::RMSE(test_y, pred_y)

cat("MSE:", mse, " MAE:", mae, " RMSE:", rmse)

MSE: 0.1451685  MAE: 0.2629213  RMSE: 0.3810099

# ================================================================================= Plot
x = 1:length(test_y)

compare <- data.frame(test_y, round(pred_y))
ggplot(data=compare) +
  geom_line(mapping=aes(x=x, y=test_y), lwd = 2, color="red") + 
  geom_line(mapping=aes(x=x, y=pred_y), lwd = 1.5, color="blue", alpha=0.5) + 
  ggtitle("Red - Predictions, Blue - Actual")+
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.position = "none")

Table Comparisons and AUC (KNN)

# ================================================================================= Evaluation
p2 <- predict(knnmodel, data.frame(train_x))
compare.train <- data.frame(train_y, round(p2))
tab2 <- table(compare.train)


tab2

       round.p2.
train_y   0   1
      0 438  50
      1  67 247

sum(diag(tab2))/sum(tab2) # predictions on training data

[1] 0.8541147

tab <- table(compare)
tab

      round.pred_y.
test_y  0  1
     0 50 11
     1  6 22

sum(diag(tab))/sum(tab) # predictions on test data

[1] 0.8089888

library(pROC)
auc(compare.train$train_y, compare.train$round.p2.) # Training Set

Area under the curve: 0.8421

auc(compare$test_y, compare$round.pred_y.) # Test Set

Area under the curve: 0.8027

This model over fits by about 4%.

OR 9

Train model #2 on the dataset; produce the probability predictions for survival, then compute the AUC

Multilevel Logistic (Softmax)

# ================================================================================ Functions
final.convert <- function(val, threshold){
  if(val >= logCost){
    decided = 1
  }else{
    decided = 0
  }
  return(decided)
}

softmax <- function(par){
  n.par <- length(par)
  par1 <- sort(par, decreasing = TRUE)
  Lk <- par1[1]
  for (k in 1:(n.par-1)) {
    Lk <- max(par1[k+1], Lk) + log1p(exp(-abs(par1[k+1] - Lk))) 
  }
  val <- exp(par - Lk)
  return(val)
}

# ================================================================================= Splitting Data
indexes <- createDataPartition(ttr2$Survived, p=0.90, list=F)
train <- data[indexes,]
test <- data[-indexes,]

attach(train)
X <- cbind(Pclass, Sex, Age, SibSp)
Y <- Survived

logit <- glm(Y~ X, family = binomial (link="logit"))
summary(logit)


Call:
glm(formula = Y ~ X, family = binomial(link = "logit"))

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.6800  -0.5823  -0.4124   0.6056   2.4524  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  2.402315   0.450224   5.336 9.51e-08 ***
XPclass     -1.187602   0.127179  -9.338  < 2e-16 ***
XSex         2.755420   0.205882  13.384  < 2e-16 ***
XAge        -0.038220   0.008202  -4.660 3.17e-06 ***
XSibSp      -0.330431   0.108109  -3.056  0.00224 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1067.33  on 801  degrees of freedom
Residual deviance:  706.03  on 797  degrees of freedom
AIC: 716.03

Number of Fisher Scoring iterations: 5

l1 <- predict(logit, type="response")
s.max <- softmax(l1)
compare <- data.frame(train$Survived, s.max)
eval <- data.frame(train$Survived, l1)

# ================================================================================= Fitting Test Set
attach(test)
X <- cbind(Pclass, Sex, Age, SibSp)
Y <- Survived

l2 <- predict(logit, test, type="response")
final.compare <- data.frame(test$Survived, l2)

# ================================================================================= Cost Function
cost = 0
for(i in 1:nrow(compare)){
  current = compare[i,1] * compare[i,2]
  cost = cost + current
}
logCost <- -(log(cost))

Table Comparisons and AUC (Softmax)

for(i in 1:nrow(final.compare)){ 
  final = final.convert(final.compare[i,2], logCost)
  final.compare[i,2] = final
}

for(i in 1:nrow(eval)){
  final = final.convert(eval[i,2], logCost)
  eval[i,2] = final
}

tab1 <- table(eval)
tab1

              l1
train.Survived   0   1
             0 485  10
             1 161 146

sum(diag(tab1))/sum(tab1) # predictions on training data

[1] 0.786783

tab7 <- table(final.compare)
tab7

             l2
test.Survived  0  1
            0 54  0
            1 17 18

sum(diag(tab7))/sum(tab7)  # predictions on test data

[1] 0.8089888

auc(eval$train.Survived, eval$l1) # Training Set

Area under the curve: 0.7277

auc(final.compare$test.Survived, final.compare$l2) # Test Set

Area under the curve: 0.7571

This model could learn more with some tweaks.

OR 10

Discuss advantages and disadvantages of the two models - which one ended up performing better? Are you comfortable with the results or what else would you like to know before “betting on this model”?

The KNN model has a higher AUC than the Softmax model, with slightly more variation between its train and test sets. This difference occurs when overfitting is present so there is definitely room for improvement. The Softmax model likely has overfitting present based on its AUC scores too, but slightly less than the KNN model.

When comparing model predictions against each other, the Softmax accurately predicts a smaller proportion on its training set than it does on its test set. In contrast, the KNN model has a difference of almost 4% between its train and test set predictions with its train set being higher. This suggests that the model thinks it knows more than it actually does when handling unseen data. If both models increased their prediction performance on their training sets, the Softmax model would have more room for improvement.

Given that both models score almost identically when predicting un-seen data, I think I would put my money on the Softmax model. Given the potential room for improvement when comparing both models, I would put my money on the Softmax model at this time. There are certainly more tweaks that could be performed to find an optimal train/test split among other methods for improvement. The Softmax model is simply the more precise model so that is my choice.

HW2

Silas Selfe

2/13/2022