Kaggle Titanic Survival
The following is a page snapshot taken during the data mining competition hosted by kaggle.com for my STATS202 course at Stanford. My explorations and models in R follow.
This was my first Kaggle contest in 2012 during my first graduate course in Data Mining. I learned that you can easily overfit the Kaggle public leaderboard! I felt accomplished after breaking the Top 10, but I was fooling myself. I knew how to avoid overfitting a test set when tuning a model, but hadn't been exposed yet to bias-variance tradeoff as a way of thinking about models.
Kaggle keeps a private set of holdout data, and prediction accuracy against that holdout determines the final rankings. By tuning against the public leaderboard I pushed the bias down and raised my public score, but I also pushed the variance up, which hurt once the models were scored on the unseen private holdout data. This newbie trap is explained well here.
# TEST data for generating predictions to submit:
#
# ti.test = read.table("file://C:/Users/Joe/My Documents/Titanic/test.csv", header=TRUE, sep=",")
# IDEAS -- TRY GLM and SVM
# IDEAS -- TRY Family name groupings!
#
# jj11E - svm EVAL script - choose BEST param values, then use jj11 routine to create test predictions for SUBMIT
# -------
jj11E <- function() {
  # Evaluation harness for the e1071 svm model on trainV2.csv.
  #
  # For every seed in `seedvector` a random third of the rows is held out, an
  # svm is fitted on the remaining rows for every gamma in `gamvector`, and
  # the train/test error and accuracy are stored in `ti.table`. The per-gamma
  # averages are then printed and plotted.
  #
  # Takes no arguments. Reads trainV2.csv from a fixed path, prints progress
  # and tables to the console, and draws two plots. Calls
  # accuracyFrom2x2ConfusionMatrix(), which is defined outside this function.
  #
  # Recorded results from earlier runs:
  # accuracy w 1/3 holdout train data : train test
  # -------------------------------------------- --------- ---------
  # svm 8SVAR as-factors radial gamma=.09 tolerance=df : 0.8361953 0.8171717
  # svm 14VAR as-factors radial gamma=.07 tolerance=df : 0.8378788 0.8259259 0.77990 << -- Kaggle public score as _jj11H matches _jj11F
  # svm 14VAR as-factors radial gamma=.1 tolerance=.3 : 0.8552189 0.8289562
  # svm 14VAR as-factors radial gamma=.1 tolerance=df : 0.8538721 0.8259259
  # svm 14VAR as-factors radial gamma=.06 tolerance=df : 0.8350168 0.8235690
  # svm 14VAR as-factors radial gamma=.1 tolerance=df : 0.xxxxxxx 0.xxxxxxx 0.77990 << -- Kaggle public score as _jj11F (BAD ver of 14VAR)
  # svm 12VAR as-factors radial gamma=.2 tolerance=.4 : 0.8760943 0.8282828
  # svm 12VAR as-factors radial gamma=.2 tolerance=df : 0.8771044 0.8276094
  # svm 12VAR as-factors radial gamma=.1 tolerance=df : 0.8484848 0.8323232 0.78469 << -- Kaggle public score as _jj11C
  # svm 12VAR as-factors polynomial gamma=.1 : 0.8664983 0.8255892
  # svm 12VAR as-factors polynomial gamma=.1 degree=2 : 0.8505051 0.8313131 0.77512 << -- Kaggle public score as _jj11D
  # svm 9VAR as-factors linear gamma=.1 : 0.8185185 0.8026936 NOTE: gamma not used by linear kernel
  # svm 9VAR as-factors polynomial gamma=.2 : 0.8816498 0.8171717
  # svm 9VAR as-factors polynomial gamma=.2 degree=2 : 0.8631313 0.8202020 0.77990 << -- Kaggle public score as _jj11A
  # svm 9VAR as-factors radial gamma=.2 tolerance=.7 : 0.8595960 0.8249158 0.79426 << -- Kaggle public score as _jj11B
  # svm 9VAR as-factors radial gamma=.2 : 0.8580808 0.8225589
  # svm 9VAR as-numerics linear gamma=.1 : 0.7888889 0.783165
  # svm 9VAR as-numerics linear scale=F gamma=.1 : 0.7883838 0.7835017
  # * 9VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
  # * 12VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3
  # * 14VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX
  # * 8SVAR = sibsp +pclass +cabincode +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX
  # (8 vars selected BASED on variable influence calculated by gbm)
  print('=============== MODEL: jj11E svm trainV2 variations (xxxx)===============')

  # Load the trainV2 TRAINING data (data prep was done in Excel).
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header = TRUE, sep = ",")

  # The svm runs treat the response and these predictors as factors.
  for (v in c("survived", "sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")) {
    ti.train[[v]] <- as.factor(ti.train[[v]])
  }

  # Combo variables that take 9VAR to 12VAR/14VAR. They are assigned by name
  # rather than by column position (14..18), so the result does not depend on
  # the CSV having exactly 13 columns.
  cabin_is_x <- ifelse(ti.train$cabincode == 'X', 1, 0)
  ti.train$sex_pclass    <- as.factor(paste0(ti.train[["sex"]], ti.train[["pclass"]]))
  ti.train$sex_agebin3   <- as.factor(paste0(ti.train[["sex"]], ti.train[["agebin3"]]))
  ti.train$sex_farebin3  <- as.factor(paste0(ti.train[["sex"]], ti.train[["farebin3"]]))
  ti.train$sex_cabinX    <- as.factor(paste0(ti.train[["sex"]], cabin_is_x))
  ti.train$pclass_cabinX <- as.factor(paste0(ti.train[["pclass"]], cabin_is_x))

  # Empty results table: one row per (batch, gamma) combination.
  ti.table <- data.frame(batch = numeric(), gam = numeric(),
                         trainerr = numeric(), testerr = numeric(),
                         trainacc = numeric(), testacc = numeric())

  # Each seed doubles as the batch number.
  seedvector <- 1:10

  library(e1071)

  # Remember the caller's panel layout and put it back when we leave.
  old_par <- par(mfrow = c(2, 3))
  on.exit(par(old_par), add = TRUE)

  #### gamvector = c(.0004, .0006, .0008, .001, .0012, .0014, .0016, .002, .0025, .003) ## c(.1, .2)
  gamvector <- c(.03, .04, .05, .06, .07, .08, .09, .1, .12, .2)
  #### gamvector = c(.1, .3, .5, .7, .9, 1.2, 1.5, 2.0, 3.0, 4.0)

  # 8SVAR predictor set, used both in the formula and to subset the test rows.
  vars8s <- c("sibsp", "pclass", "cabincode", "sex_pclass", "sex_agebin3",
              "sex_farebin3", "sex_cabinX", "pclass_cabinX")

  for (s in seq_along(seedvector)) {
    # Fresh, reproducible 1/3 holdout for this batch.
    set.seed(seedvector[s])
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    for (i in seq_along(gamvector)) {
      # 9VAR version:
      # ti.svm = svm(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3,
      # 14VAR version:
      # ti.svm = svm(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
      #   +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX +pclass_cabinX,
      # 8SVAR version:
      ti.svm <- svm(survived ~ sibsp + pclass + cabincode + sex_pclass + sex_agebin3 +
                      sex_farebin3 + sex_cabinX + pclass_cabinX,
                    data = ti.trainset,
                    kernel = "radial", ## "polynomial", ## "sigmoid", ## "linear", ##
                    gamma = gamvector[i]) ## .1) ## .1,
      # degree= gamvector[i])
      # tolerance= gamvector[i])

      # predict() without newdata gives the fitted values on the training rows.
      ti.train.predictions <- predict(ti.svm)
      # 9VAR version:
      # ti.test.predictions = predict(ti.svm, ti.testset[, c("sex", "sibsp", "parch", "pclass",
      #   "embarked", "cabincode", "caibinlen5", "agebin3", "farebin3")])
      # 14VAR version:
      # ti.test.predictions = predict(ti.svm, ti.testset[, c("sex", "sibsp", "parch", "pclass",
      #   "embarked", "cabincode", "caibinlen5", "agebin3", "farebin3",
      #   "sex_pclass", "sex_agebin3", "sex_farebin3", "sex_cabinX", "pclass_cabinX" )])
      # 8SVAR version:
      ti.test.predictions <- predict(ti.svm, ti.testset[, vars8s])

      ti.train.error <- sum(ti.train.predictions != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.predictions != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.predictions, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.predictions, ti.testset$survived))

      # Row index runs 1..(batches * gammas); s is stored as the batch number.
      ti.table[(s - 1) * length(gamvector) + i, ] <-
        c(s, gamvector[i], ti.train.error, ti.test.error, ti.train.acc, ti.test.acc)
    }
  }

  # Sanity check: one row per (seed, gamma). Inside a function a bare
  # expression is not auto-printed, so print explicitly.
  print(dim(ti.table))
  print(head(ti.table, 10))

  # Zero-row copy of ti.table used as the template for the per-gamma averages.
  ti.table.gammeans <- ti.table[FALSE, ]
  print(dim(ti.table.gammeans))

  # Average columns 3..6 (errors and accuracies) over the batches, by gamma.
  for (i in seq_along(gamvector)) {
    rows <- ti.table$gam == gamvector[i]
    ti.table.gammeans[i, ] <- c(1, gamvector[i],
                                mean(ti.table[rows, 3]),
                                mean(ti.table[rows, 4]),
                                mean(ti.table[rows, 5]),
                                mean(ti.table[rows, 6]))
  }
  # Sanity check - the averaged series should look smoother.
  print(ti.table.gammeans)

  # Blue = training, red = test.
  par(mfrow = c(2, 1))
  plot(ti.table.gammeans$gam, ti.table.gammeans$trainerr, pch = 22, bg = "blue", type = "b",
       main = "Avg Training vs Test Error by gam param", xlab = "gam param", ylab = "Error", ylim = c(.10, .21))
  points(ti.table.gammeans$gam, ti.table.gammeans$testerr, pch = 22, bg = "red", type = "b")
  plot(ti.table.gammeans$gam, ti.table.gammeans$trainacc, pch = 22, bg = "blue", type = "b",
       main = "Avg Training vs Test Accuracy by gam param", xlab = "gam param", ylab = "Accuracy", ylim = c(.78, .90))
  points(ti.table.gammeans$gam, ti.table.gammeans$testacc, pch = 22, bg = "red", type = "b")

  print('=============== DONE: function jj11E =================')
}
#
# jj11
# -------
jj11 <- function() {
  # Fit the 14VAR radial svm (gamma = 0.07) on all of trainV2.csv, print the
  # training confusion matrix and accuracy, then predict testV2.csv and write
  # the predictions to testpredictions_jj11H.csv.
  #
  # Takes no arguments. Reads and writes files at fixed paths. Calls
  # accuracyFrom2x2ConfusionMatrix(), which is defined outside this function.
  # SEE function jj11E for the evaluation runs behind the chosen parameters.
  print('=============== MODEL: jj11 svm trainV2 as factors ===============')

  # Adds the five combo variables (9VAR -> 12VAR/14VAR) to a data frame.
  # Columns are assigned by name, not by position 14..18, so train and test
  # files need not have the same column count.
  add_combo_features <- function(df) {
    cabin_is_x <- ifelse(df$cabincode == 'X', 1, 0)
    df$sex_pclass    <- as.factor(paste0(df[["sex"]], df[["pclass"]]))
    df$sex_agebin3   <- as.factor(paste0(df[["sex"]], df[["agebin3"]]))
    df$sex_farebin3  <- as.factor(paste0(df[["sex"]], df[["farebin3"]]))
    df$sex_cabinX    <- as.factor(paste0(df[["sex"]], cabin_is_x))
    df$pclass_cabinX <- as.factor(paste0(df[["pclass"]], cabin_is_x))
    df
  }

  # Predictors that are converted to factors in both train and test.
  factor_vars <- c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")

  # Load the trainV2 TRAINING data (data prep was done in Excel).
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header = TRUE, sep = ",")
  ti.train$survived <- as.factor(ti.train$survived)
  for (v in factor_vars) {
    ti.train[[v]] <- as.factor(ti.train[[v]])
  }
  ti.train <- add_combo_features(ti.train)

  library(e1071)
  # 9VAR version:
  # ti.svm = svm(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3,
  # 14VAR version:
  ti.svm <- svm(survived ~ sex + sibsp + parch + pclass + embarked + cabincode + caibinlen5 +
                  agebin3 + farebin3 +
                  sex_pclass + sex_agebin3 + sex_farebin3 + sex_cabinX + pclass_cabinX,
                data = ti.train,
                kernel = "radial", ## "polynomial", ##
                gamma = 0.07)
  ## degree= 2)
  ## tolerance= 0.3)

  # Training-set fit: confusion matrix and accuracy.
  ti.train.pred <- predict(ti.svm)
  ti.train.tab <- table(ti.train.pred, ti.train$survived, dnn = c("predicted", "actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj11 starting TEST predictions ===============')
  # Load the testV2 TEST data (data prep was done in Excel) and give it the
  # same factor conversions and combo variables as the training data.
  ti.test <- read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv", header = TRUE, sep = ",")
  for (v in factor_vars) {
    ti.test[[v]] <- as.factor(ti.test[[v]])
  }
  ti.test <- add_combo_features(ti.test)

  print("test predictions being output to ti.test.pred...")
  # 9VAR version:
  # ti.test.pred = predict(ti.svm, ti.test[, c("sex", "sibsp", "parch", "pclass",
  #   "embarked", "cabincode", "caibinlen5", "agebin3", "farebin3")])
  # 14VAR version:
  ti.test.pred <- predict(ti.svm, ti.test[, c("sex", "sibsp", "parch", "pclass",
    "embarked", "cabincode", "caibinlen5", "agebin3", "farebin3",
    "sex_pclass", "sex_agebin3", "sex_farebin3",
    "sex_cabinX", "pclass_cabinX")])

  # Save predictions. write.csv() ignores col.names (with a warning) and
  # always writes the header row, so the argument is not passed; the file
  # content is the same as before. To drop the header, write.table(...,
  # sep = ",", col.names = FALSE) would be needed instead.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj11H.csv",
            row.names = FALSE, quote = FALSE)
}
#
# jj91E - 14VAR gbm EVAL script - choose BEST params, then use jj91 routine to create test predictions for SUBMIT
# -------
jj91E <- function() {
  # Evaluation harness for the 14VAR gbm model on trainV2.csv.
  #
  # For every seed in `seedvector` a random third of the rows is held out and
  # a bernoulli gbm is fitted on the remaining rows for every value in
  # `nvector`, which is currently passed as `shrinkage`. Train/test error and
  # accuracy are stored in `ti.table`, averaged per `nvector` value, printed
  # and plotted.
  #
  # NOTE: the table column and the plot labels are still called "idepth"
  # although the swept parameter is shrinkage in this version.
  #
  # Takes no arguments. Calls accuracyFrom2x2ConfusionMatrix(), which is
  # defined outside this function.
  #
  # Recorded results from earlier runs:
  # accuracy w 1/3 holdout train data 14VAR : train-avg test-avg
  # --------------------------------------------------------------------------------------------- --------- ---------
  # gaussian n.tree= 100 (0->1 cutoff .40) shrinkage=.001 bag.fraction=dflt interaction.depth=7 0.8466330 0.8242424 DEC 4: TRY gaussian, test cutoff pct...
  # gaussian n.tree= 500 (0->1 cutoff .40) shrinkage=.001 bag.fraction=dflt interaction.depth=4 0.8506734 0.8249158
  # gaussian n.tree= 500 (0->1 cutoff .40) shrinkage=.001 bag.fraction=dflt interaction.depth=2 0.8109428 0.8087542 <-- diff .002
  # bernoulli n.tree= 100 (0->1 cutoff -.40) shrinkage=.001 bag.fraction=dflt interaction.depth=4 0.8338384 0.8228956 <-- diff .011
  # bernoulli n.tree= 500 (0->1 cutoff -.40) shrinkage=.001 bag.fraction=dflt interaction.depth=2 0.8144781 0.8111111 <-- diff .003
  # bernoulli n.tree= 500 (0->1 cutoff -.40) shrinkage=.001 bag.fraction=dflt interaction.depth=4 0.8488215 0.8242424
  # bernoulli n.tree= 500 (0->1 cutoff -.40) shrinkage=.006 bag.fraction=dflt interaction.depth=2 0.8434343 0.8245791
  # bernoulli n.tree=2000 (0->1 cutoff .45) shrinkage=.008 bag.fraction=dflt interaction.depth=1 0.8309764 0.8272727 <-- diff .004 0.76555 Kaggle public score as _jj91B
  # bernoulli n.tree= 500 (0->1 cutoff .45) shrinkage=.012 bag.fraction=dflt interaction.depth=2 0.8365320 0.8279461 <-- diff .008 0.xxxxx _jj91C only 1DIFF from _jj91B
  # bernoulli n.tree= 500 (0->1 cutoff -.36) shrinkage=.001 bag.fraction=dflt interaction.depth=2 0.8190236 0.8134680 <-- diff .005 0.78469 Kaggle public score as _jj91A
  # bernoulli n.tree= 500 (0->1 cutoff -.36) shrinkage=.001 bag.fraction=dflt interaction.depth=3 0.8380471 0.8232323
  # bernoulli n.tree=2000 (0->1 cutoff -.36) shrinkage=.001 bag.fraction=dflt interaction.depth=3 0.8356902 0.8158249 <-- diff .019
  # bernoulli n.tree=8000 (0->1 cutoff -.36) shrinkage=.001 bag.fraction=dflt interaction.depth=1 0.8439394 0.8309764 <-- diff .013 0.77990 Kaggle public score as _jj91
  # bernoulli n.tree=8000 (0->1 cutoff -.36) shrinkage=.001 bag.fraction=dflt interaction.depth=2 0.8550505 0.8313131
  print('=============== MODEL: jj91E gbm trainV2 variations (xxxx)===============')

  # Load the trainV2 TRAINING data (data prep was done in Excel).
  # All original vars are used as read; only the combo vars become factors.
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header = TRUE, sep = ",")

  # Combo variables that take 9VAR to 14VAR. They are assigned by name rather
  # than by column position (14..18), so the result does not depend on the CSV
  # having exactly 13 columns.
  cabin_is_x <- ifelse(ti.train$cabincode == 'X', 1, 0)
  ti.train$sex_pclass    <- as.factor(paste0(ti.train[["sex"]], ti.train[["pclass"]]))
  ti.train$sex_agebin3   <- as.factor(paste0(ti.train[["sex"]], ti.train[["agebin3"]]))
  ti.train$sex_farebin3  <- as.factor(paste0(ti.train[["sex"]], ti.train[["farebin3"]]))
  ti.train$sex_cabinX    <- as.factor(paste0(ti.train[["sex"]], cabin_is_x))
  ti.train$pclass_cabinX <- as.factor(paste0(ti.train[["pclass"]], cabin_is_x))

  # Empty results table: one row per (batch, nvector value) combination.
  ti.table <- data.frame(batch = numeric(), idepth = numeric(),
                         trainerr = numeric(), testerr = numeric(),
                         trainacc = numeric(), testacc = numeric())

  # Each seed doubles as the batch number.
  seedvector <- 1:10 ## 1:5 ##

  library(gbm)

  # gbm.perf() draws one plot per fit into this grid. Remember the caller's
  # panel layout and put it back when we leave.
  old_par <- par(mfrow = c(2, 5))
  on.exit(par(old_par), add = TRUE)

  ### nvector = 1:4 ## 1:10
  ### nvector = 8 ## c(.0400, .0350, .0300, .0250, .0200, .0150, .0100, .0080, .0060, .0040)
  ### nvector = c(.006, .008, .01, .012, .014)
  nvector <- c(.008, .01, .012, .014, .016, .018, .02, .022, .024)
  ### nvector = c(100, 300, 400, 500, 600, 700, 800, 1000)
  ### nvector = c(.2, .3, .35, .4, .45, .5, .6)

  for (s in seq_along(seedvector)) {
    # Fresh, reproducible 1/3 holdout for this batch.
    set.seed(seedvector[s])
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    for (i in seq_along(nvector)) {
      ti.gbm <- gbm(survived ~ sex + sibsp + parch + pclass + embarked + cabincode + caibinlen5 +
                      agebin3 + farebin3 +
                      sex_pclass + sex_agebin3 + sex_farebin3 + sex_cabinX + pclass_cabinX,
                    data = ti.trainset,
                    distribution = "bernoulli",
                    n.trees = 2000,
                    shrinkage = nvector[i],
                    interaction.depth = 1,
                    cv.folds = 5)

      # Number of trees chosen by 5-fold cross-validation (from the gbm docs).
      best.iter <- gbm.perf(ti.gbm, method = "cv")
      print(best.iter)

      ti.train.pred <- predict(ti.gbm, ti.trainset, best.iter)
      ti.test.pred <- predict(ti.gbm, ti.testset, best.iter)

      # NOTE(review): predict.gbm defaults to type = "link", so with
      # distribution = "bernoulli" these values are presumably log-odds rather
      # than probabilities (the earlier note says they can be NEGATIVE).
      # Confirm the cutoff below is meant on that scale.
      # ti.train.pred01 = ifelse(ti.train.pred < 0.5, 0, 1)
      # ti.test.pred01 = ifelse(ti.test.pred < 0.5, 0, 1)
      # use for gaussian?
      ti.train.pred01 <- ifelse(ti.train.pred < 0.45, 0, 1)
      ti.test.pred01 <- ifelse(ti.test.pred < 0.45, 0, 1)
      # ti.train.pred01 = ifelse(ti.train.pred < nvector[i], 0, 1)
      # ti.test.pred01 = ifelse(ti.test.pred < nvector[i], 0, 1)
      # use for bernoulli? NOTE these values are NEGATIVE
      # ti.train.pred01 = ifelse(ti.train.pred < -0.36, 0, 1)
      # ti.test.pred01 = ifelse(ti.test.pred < -0.36, 0, 1)

      ti.train.error <- sum(ti.train.pred01 != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.pred01 != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.pred01, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.pred01, ti.testset$survived))

      # Row index runs 1..(batches * values); s is stored as the batch number.
      ti.table[(s - 1) * length(nvector) + i, ] <-
        c(s, nvector[i], ti.train.error, ti.test.error, ti.train.acc, ti.test.acc)
    }
  }

  # Sanity check: one row per (seed, nvector value). Inside a function a bare
  # expression is not auto-printed, so print explicitly.
  print(dim(ti.table))
  print(head(ti.table, 10))

  # Zero-row copy of ti.table used as the template for the per-value averages.
  ti.table.nmeans <- ti.table[FALSE, ]
  print(dim(ti.table.nmeans))

  # Average columns 3..6 (errors and accuracies) over the batches, by value.
  for (i in seq_along(nvector)) {
    rows <- ti.table$idepth == nvector[i]
    ti.table.nmeans[i, ] <- c(1, nvector[i],
                              mean(ti.table[rows, 3]),
                              mean(ti.table[rows, 4]),
                              mean(ti.table[rows, 5]),
                              mean(ti.table[rows, 6]))
  }
  # Sanity check - the averaged series should look smoother.
  print(ti.table.nmeans)

  # Blue = training, red = test.
  par(mfrow = c(2, 1))
  plot(ti.table.nmeans$idepth, ti.table.nmeans$trainerr, pch = 22, bg = "blue", type = "b",
       main = "Avg Training vs Test Error by idepth param", xlab = "idepth param", ylab = "Error", ylim = c(.10, .40))
  points(ti.table.nmeans$idepth, ti.table.nmeans$testerr, pch = 22, bg = "red", type = "b")
  plot(ti.table.nmeans$idepth, ti.table.nmeans$trainacc, pch = 22, bg = "blue", type = "b",
       main = "Avg Training vs Test Accuracy by idepth param", xlab = "idepth param", ylab = "Accuracy", ylim = c(.60, .90))
  points(ti.table.nmeans$idepth, ti.table.nmeans$testacc, pch = 22, bg = "red", type = "b")

  print('=============== END jj91E ===============')
}
#
# jj91
# -------
jj91 <- function() {
  # Fit the 14VAR bernoulli gbm (500 trees, depth 2, shrinkage 0.012) on all
  # of trainV2.csv, print diagnostics and the training confusion matrix, then
  # predict testV2.csv and write 0/1 predictions to testpredictions_jj91C.csv.
  #
  # Takes no arguments. Reads and writes files at fixed paths and draws
  # plots. Calls accuracyFrom2x2ConfusionMatrix(), which is defined outside
  # this function.
  print('=============== MODEL: jj91 gbm trainV2 as read in (8810) ===============')

  # Adds the five combo variables (9VAR -> 14VAR) to a data frame. Columns
  # are assigned by name, not by position 14..18, so train and test files
  # need not have the same column count.
  add_combo_features <- function(df) {
    cabin_is_x <- ifelse(df$cabincode == 'X', 1, 0)
    df$sex_pclass    <- as.factor(paste0(df[["sex"]], df[["pclass"]]))
    df$sex_agebin3   <- as.factor(paste0(df[["sex"]], df[["agebin3"]]))
    df$sex_farebin3  <- as.factor(paste0(df[["sex"]], df[["farebin3"]]))
    df$sex_cabinX    <- as.factor(paste0(df[["sex"]], cabin_is_x))
    df$pclass_cabinX <- as.factor(paste0(df[["pclass"]], cabin_is_x))
    df
  }

  # Load the trainV2 TRAINING data (data prep was done in Excel).
  ti.train <- add_combo_features(
    read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header = TRUE, sep = ","))

  # Note - response var must be numeric for gbm, so survived is left as read.
  library(gbm)
  # The formula uses agebin3 (not agebin), matching the 14VAR definition and
  # the formula evaluated in jj91E.
  ti.gbm <- gbm(survived ~ sex + sibsp + parch + pclass + embarked + cabincode + caibinlen5 +
                  agebin3 + farebin3 +
                  sex_pclass + sex_agebin3 + sex_farebin3 + sex_cabinX + pclass_cabinX,
                data = ti.train,
                distribution = "bernoulli",
                n.trees = 500,
                interaction.depth = 2,
                shrinkage = 0.012,
                cv.folds = 5)

  # Remember the caller's panel layout and put it back when we leave.
  old_par <- par(mfrow = c(1, 3))
  on.exit(par(old_par), add = TRUE)
  print(plot(ti.gbm))

  # Number of trees chosen by 5-fold cross-validation (from the gbm docs).
  best.iter <- gbm.perf(ti.gbm, method = "cv")
  print(best.iter)
  # Plot variable influence, based on the estimated best number of trees.
  summary(ti.gbm, n.trees = best.iter)

  # NOTE(review): predict.gbm defaults to type = "link", so with
  # distribution = "bernoulli" the 0.45 cutoff is presumably applied to
  # log-odds rather than probabilities. Confirm this is intended.
  ti.train.pred <- predict(ti.gbm, ti.train, best.iter)
  ti.train.pred01 <- ifelse(ti.train.pred < 0.45, 0, 1)
  ti.train.tab <- table(ti.train.pred01, ti.train$survived, dnn = c("predicted", "actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj91 starting TEST predictions ===============')
  # Load the testV2 TEST data (data prep was done in Excel) and add the same
  # combo variables as for the training data.
  ti.test <- add_combo_features(
    read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv", header = TRUE, sep = ","))

  print("test predictions being output to ti.test.pred...")
  ti.test.pred <- predict(ti.gbm, ti.test, best.iter)
  ti.test.pred01 <- ifelse(ti.test.pred < 0.45, 0, 1)

  # Save predictions. write.csv() ignores col.names (with a warning) and
  # always writes the header row, so the argument is not passed; the file
  # content is the same as before. To drop the header, write.table(...,
  # sep = ",", col.names = FALSE) would be needed instead.
  write.csv(ti.test.pred01, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj91C.csv",
            row.names = FALSE, quote = FALSE)
}
#
# jj9E - gbm EVAL script - choose BEST params, then use jj9 routine to create test predictions for SUBMIT
# -------
jj9E <- function() {
  # Evaluation harness for the 9VAR gbm model on trainV2.csv.
  #
  # For every seed in `seedvector` a random third of the rows is held out and
  # a bernoulli gbm is fitted on the remaining rows for every
  # interaction.depth in `idepthvector`. Train/test error and accuracy are
  # stored in `ti.table`, averaged per depth, printed and plotted.
  #
  # Takes no arguments. Calls accuracyFrom2x2ConfusionMatrix(), which is
  # defined outside this function.
  #
  # Recorded results from earlier runs:
  # accuracy w 1/3 holdout train data : train-avg test-avg
  # --------------------------------------------------------------------------- ------ ------
  # gaussian n.tree=2000 shrinkage=.003 bag.fraction=dflt interaction.depth=8 : 0.8632997 0.8720539 0.78469 << -- (BAD traindata) Kaggle public score as _jj9
  # gaussian n.tree=2000 shrinkage=.003 bag.fraction=0.7 interaction.depth=8 : 0.8663300 0.8754209
  # gaussian n.tree=2000 shrinkage=.003 bag.fraction=0.2 interaction.depth=8 : 0.8613917 0.8709315 0.76555 << -- (BAD traindata) Kaggle public score as _jj9B
  # bernoulli n.tree=2000 shrinkage=.010 bag.fraction=dflt interaction.depth=5 : 0.8479237 0.8563412
  # bernoulli n.tree=2000 shrinkage=.003 bag.fraction=dflt interaction.depth=6 : 0.8473625 0.8574635
  # bernoulli n.tree=9000 shrinkage=.001 bag.fraction=dflt interaction.depth=5 : 0.8462402 0.8552189 0.74641 << -- (BAD traindata) Kaggle public score as _jj9C
  # adaboost n.tree=9000 shrinkage=.001 bag.fraction=dflt interaction.depth=7 : 0.8187430 0.8148148
  print('=============== MODEL: jj9E gbm trainV2 variations (xxxx)===============')

  # Load the trainV2 TRAINING data (data prep was done in Excel).
  # All vars are used as read, with NO as.factor(...) conversion.
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header = TRUE, sep = ",")

  # Empty results table: one row per (batch, depth) combination.
  ti.table <- data.frame(batch = numeric(), idepth = numeric(),
                         trainerr = numeric(), testerr = numeric(),
                         trainacc = numeric(), testacc = numeric())

  # Each seed doubles as the batch number.
  seedvector <- 1:3 ## 1:5 ## 1:10

  library(gbm)

  # gbm.perf() draws one plot per fit into this grid. Remember the caller's
  # panel layout and put it back when we leave.
  old_par <- par(mfrow = c(2, 5))
  on.exit(par(old_par), add = TRUE)

  idepthvector <- 8 ## c(.0400, .0350, .0300, .0250, .0200, .0150, .0100, .0080, .0060, .0040)

  for (s in seq_along(seedvector)) {
    # Fresh, reproducible 1/3 holdout for this batch.
    set.seed(seedvector[s])
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    for (i in seq_along(idepthvector)) {
      ti.gbm <- gbm(survived ~ sex + sibsp + parch + pclass + embarked + cabincode + caibinlen5 +
                      agebin3 + farebin3,
                    ## data=ti.train,
                    ## corrected to trainset!
                    data = ti.trainset,
                    distribution = "bernoulli", # "gaussian", # "adaboost", ##
                    n.trees = 9000, # 10000,
                    shrinkage = 0.001,
                    interaction.depth = idepthvector[i],
                    cv.folds = 5)

      # Number of trees chosen by 5-fold cross-validation (from the gbm docs).
      best.iter <- gbm.perf(ti.gbm, method = "cv")
      print(best.iter)

      # NOTE(review): predict.gbm defaults to type = "link", so with
      # distribution = "bernoulli" the 0.5 cutoff is presumably applied to
      # log-odds rather than probabilities. Confirm this is intended.
      ti.train.pred <- predict(ti.gbm, ti.trainset, best.iter)
      ti.train.pred01 <- ifelse(ti.train.pred < 0.5, 0, 1)
      ti.test.pred <- predict(ti.gbm, ti.testset, best.iter)
      ti.test.pred01 <- ifelse(ti.test.pred < 0.5, 0, 1)

      ti.train.error <- sum(ti.train.pred01 != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.pred01 != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.pred01, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.pred01, ti.testset$survived))

      # Row index runs 1..(batches * depths); s is stored as the batch number.
      ti.table[(s - 1) * length(idepthvector) + i, ] <-
        c(s, idepthvector[i], ti.train.error, ti.test.error, ti.train.acc, ti.test.acc)
    }
  }

  # Sanity check: one row per (seed, depth). Inside a function a bare
  # expression is not auto-printed, so print explicitly. head() is used
  # because the table can have fewer than 10 rows (3 seeds x 1 depth here),
  # where ti.table[1:10, ] would pad with NA rows.
  print(dim(ti.table))
  print(head(ti.table, 10))

  # Zero-row copy of ti.table used as the template for the per-depth averages.
  ti.table.idepthmeans <- ti.table[FALSE, ]
  print(dim(ti.table.idepthmeans))

  # Average columns 3..6 (errors and accuracies) over the batches, by depth.
  for (i in seq_along(idepthvector)) {
    rows <- ti.table$idepth == idepthvector[i]
    ti.table.idepthmeans[i, ] <- c(1, idepthvector[i],
                                   mean(ti.table[rows, 3]),
                                   mean(ti.table[rows, 4]),
                                   mean(ti.table[rows, 5]),
                                   mean(ti.table[rows, 6]))
  }
  # Sanity check - the averaged series should look smoother.
  print(ti.table.idepthmeans)

  # Blue = training, red = test.
  par(mfrow = c(2, 1))
  plot(ti.table.idepthmeans$idepth, ti.table.idepthmeans$trainerr, pch = 22, bg = "blue", type = "b",
       main = "Avg Training vs Test Error by idepth param", xlab = "idepth param", ylab = "Error", ylim = c(.10, .40))
  points(ti.table.idepthmeans$idepth, ti.table.idepthmeans$testerr, pch = 22, bg = "red", type = "b")
  plot(ti.table.idepthmeans$idepth, ti.table.idepthmeans$trainacc, pch = 22, bg = "blue", type = "b",
       main = "Avg Training vs Test Accuracy by idepth param", xlab = "idepth param", ylab = "Accuracy", ylim = c(.60, .90))
  points(ti.table.idepthmeans$idepth, ti.table.idepthmeans$testacc, pch = 22, bg = "red", type = "b")

  print('=============== MODEL: jj9E starting TEST predictions ===============')
}
#
# jj9
# -------
jj9 <- function() {
  # Fit the 9VAR bernoulli gbm (9000 trees, depth 5, shrinkage 0.001) on all
  # of trainV2.csv, print diagnostics and the training confusion matrix, then
  # predict testV2.csv and write 0/1 predictions to testpredictions_jj9C.csv.
  #
  # Takes no arguments. Reads and writes files at fixed paths and draws
  # plots. Calls accuracyFrom2x2ConfusionMatrix(), which is defined outside
  # this function.
  print('=============== MODEL: jj9 gbm trainV2 as read in (8810) ===============')

  # Load the trainV2 TRAINING data (data prep was done in Excel).
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header = TRUE, sep = ",")
  # ti.train$sibsp = as.factor(ti.train$sibsp)
  # ti.train$parch = as.factor(ti.train$parch)
  # ti.train$pclass = as.factor(ti.train$pclass)
  # ti.train$caibinlen5 = as.factor(ti.train$caibinlen5)
  # ti.train$agebin3 = as.factor(ti.train$agebin3)
  # ti.train$farebin3 = as.factor(ti.train$farebin3)

  # Note - response var must be numeric for gbm, so survived is left as read.
  library(gbm)
  ti.gbm <- gbm(survived ~ sex + sibsp + parch + pclass + embarked + cabincode + caibinlen5 +
                  agebin3 + farebin3,
                data = ti.train,
                distribution = "bernoulli", ## "gaussian", # "adaboost", #
                n.trees = 9000, interaction.depth = 5, shrinkage = 0.001,
                cv.folds = 5)

  # Remember the caller's panel layout and put it back when we leave.
  old_par <- par(mfrow = c(2, 1))
  on.exit(par(old_par), add = TRUE)
  print(plot(ti.gbm))

  # ---------- Perf evals from R docs --------------
  # Out-of-bag estimate first; per the original note, OOB underestimates the
  # optimal number of iterations. It is printed for comparison only.
  best.iter <- gbm.perf(ti.gbm, method = "OOB")
  print(best.iter)
  # The 5-fold cross-validation estimate overwrites best.iter and is the one
  # used for every prediction below.
  best.iter <- gbm.perf(ti.gbm, method = "cv")
  print(best.iter)
  # Plot variable influence, based on the estimated best number of trees.
  summary(ti.gbm, n.trees = best.iter)

  # NOTE(review): predict.gbm defaults to type = "link", so with
  # distribution = "bernoulli" the 0.5 cutoff is presumably applied to
  # log-odds rather than probabilities. Confirm this is intended.
  ti.train.pred <- predict(ti.gbm, ti.train, best.iter)
  ti.train.pred01 <- ifelse(ti.train.pred < 0.5, 0, 1)
  ti.train.tab <- table(ti.train.pred01, ti.train$survived, dnn = c("predicted", "actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj9 starting TEST predictions ===============')
  # Load the testV2 TEST data (data prep was done in Excel).
  ti.test <- read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv", header = TRUE, sep = ",")
  print("test predictions being output to ti.test.pred...")
  # ti.test.pred = predict(ti.rf,
  #   ti.test[, c("pclass", "sex", "sibsp", "parch", "embarked", "cabincode", "caibinlen5", "agebin3", "farebin3")] )
  ti.test.pred <- predict(ti.gbm, ti.test, best.iter)
  ti.test.pred01 <- ifelse(ti.test.pred < 0.5, 0, 1)

  # Save predictions. write.csv() ignores col.names (with a warning) and
  # always writes the header row, so the argument is not passed; the file
  # content is the same as before. To drop the header, write.table(...,
  # sep = ",", col.names = FALSE) would be needed instead.
  write.csv(ti.test.pred01, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj9C.csv",
            row.names = FALSE, quote = FALSE)
}
#
# jj8E - randomForest with 12VAR COMBO features EVAL script - choose BEST params, then MAKE/use jj8 routine to create test predictions for SUBMIT
# ------- -------------------------
jj8E <- function() {
  # jj8E: holdout evaluation of randomForest over a grid of tree counts.
  #
  # Reads trainV2.csv, turns the count/bin predictors into factors, adds the
  # combo factors, then for every seed in `seedvector` draws a 1/3 holdout and
  # fits one forest per entry of `ntvector` on the remaining rows. Error and
  # accuracy on both partitions are stored per run in `ti.table`, averaged per
  # tree count into `ti.table.ntmeans`, printed, and plotted.
  # Takes no arguments; called for its console output and plots.
  #
  # accuracy w 1/3 holdout train data : train test
  # -------------------------------------------- --------- ---------
  # * 9VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
  # * 12VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3
  # * 8SVAR = sibsp +pclass +cabincode +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX
  print('=============== MODEL: jj8E randomForest trainV2 12VAR variations ===============')
  # NOTE: the banner text says 12VAR, but the formula active below is 8SVAR.

  # Load the trainV2 TRAINING data - DATA PREP done in Excel
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header=TRUE, sep=",")

  # `survived` is left numeric so randomForest fits a REGRESSION; predictions
  # are cut to 0/1 with `cutoff` below. For classification, uncomment the next
  # line and remove the thresholding step.
  # ti.train$survived <- as.factor(ti.train$survived)
  factor_cols <- c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")
  ti.train[factor_cols] <- lapply(ti.train[factor_cols], as.factor)

  # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
  # Columns are added by name (not by positions 14..18) so the prep does not
  # depend on the CSV having exactly 13 columns.
  no_cabin <- ifelse(ti.train[["cabincode"]] == 'X', 1, 0)
  ti.train$sex_pclass <- as.factor(paste0(ti.train[["sex"]], ti.train[["pclass"]]))
  ti.train$sex_agebin3 <- as.factor(paste0(ti.train[["sex"]], ti.train[["agebin3"]]))
  ti.train$sex_farebin3 <- as.factor(paste0(ti.train[["sex"]], ti.train[["farebin3"]]))
  ti.train$sex_cabinX <- as.factor(paste0(ti.train[["sex"]], no_cabin))
  ti.train$pclass_cabinX <- as.factor(paste0(ti.train[["pclass"]], no_cabin))

  # The seed vector serves as unique sample seed and also as batch number.
  seedvector <- 1:10
  ntvector <- c(40, 60, 80, 100, 200, 400, 600, 1000, 2000, 4000)
  n_nt <- length(ntvector)

  # One row per (batch, nt) run, preallocated.
  result_cols <- c("batch", "nt", "trainerr", "testerr", "trainacc", "testacc")
  ti.table <- as.data.frame(matrix(NA_real_,
                                   nrow = length(seedvector) * n_nt,
                                   ncol = length(result_cols),
                                   dimnames = list(NULL, result_cols)))

  # ONLY meaningful while using regression instead of classification.
  cutoff <- .36

  # 12VAR version of the predictor columns:
  # predictor_cols <- c("pclass", "sex", "sibsp", "parch", "embarked", "cabincode", "caibinlen5", "agebin3", "farebin3", "sex_pclass", "sex_agebin3", "sex_farebin3")
  # 8SVAR version:
  predictor_cols <- c("sibsp", "pclass", "cabincode", "sex_pclass", "sex_agebin3", "sex_farebin3", "sex_cabinX", "pclass_cabinX")

  library(randomForest)
  for (s in seq_along(seedvector)) {
    # Reset sample seed, then slice new training and test sets (1/3 holdout).
    set.seed(seedvector[s])
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    for (i in seq_along(ntvector)) {
      # 12VAR version:
      # ti.rf <- randomForest(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3,
      # 8SVAR version (`ntree` spelled out; the old `nt=` relied on partial
      # argument matching):
      ti.rf <- randomForest(survived ~ sibsp +pclass +cabincode +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX,
                            data = ti.trainset, ntree = ntvector[i])

      # predict() without newdata returns the out-of-bag predictions.
      ti.train.predictions <- ifelse(predict(ti.rf) < cutoff, 0, 1)
      ti.test.predictions <- ifelse(
        predict(ti.rf, ti.testset[, predictor_cols]) < cutoff, 0, 1)

      ti.train.error <- sum(ti.train.predictions != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.predictions != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.predictions, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.predictions, ti.testset$survived))

      # Row stride follows length(ntvector) instead of a hard-coded 10, so the
      # grid can be edited without corrupting the table.
      ti.table[(s - 1) * n_nt + i, ] <- c(s, ntvector[i], ti.train.error, ti.test.error,
                                          ti.train.acc, ti.test.acc)
    }
  }

  # Sanity check: every batch should carry the same nt series. Explicit
  # print() is needed because bare expressions do not auto-print in a function.
  print(dim(ti.table))
  print(head(ti.table, 4))
  print(head(ti.table[ti.table$batch == 2, ], 4))

  # Average the train/test metrics by nt. `==` is safe here because the nt
  # values in the table were copied verbatim from ntvector.
  ti.table.ntmeans <- do.call(rbind, lapply(ntvector, function(nt_value) {
    runs <- ti.table[ti.table$nt == nt_value, ]
    data.frame(batch = 1, nt = nt_value,
               trainerr = mean(runs$trainerr), testerr = mean(runs$testerr),
               trainacc = mean(runs$trainacc), testacc = mean(runs$testacc))
  }))
  print(ti.table.ntmeans)

  # Two stacked panels; restore the caller's layout when the function exits.
  old_par <- par(mfrow=c(2,1))
  on.exit(par(old_par), add = TRUE)
  plot(ti.table.ntmeans$nt, ti.table.ntmeans$trainerr, pch=22, bg="blue", type="b",
       main="Avg Training vs Test Error by nt param", xlab="nt param", ylab="Error", ylim=c(.15, .21))
  points(ti.table.ntmeans$nt, ti.table.ntmeans$testerr, pch=22, bg="red", type="b")
  plot(ti.table.ntmeans$nt, ti.table.ntmeans$trainacc, pch=22, bg="blue", type="b",
       main="Avg Training vs Test Accuracy by nt param", xlab="nt param", ylab="Accuracy", ylim=c(.78, .86))
  points(ti.table.ntmeans$nt, ti.table.ntmeans$testacc, pch=22, bg="red", type="b")
  print('=============== END: jj8E ===============')
}
#
# jj8
# -------
jj8 <- function() {
  # jj8: fit the randomForest model on all of trainV2.csv and write test
  # predictions for submission.
  #
  # Prints the training confusion table and accuracy (from predict() without
  # newdata, i.e. the out-of-bag predictions), then scores testV2.csv and
  # writes one 0/1 prediction per line to testpredictions_jj8B.csv.
  # Takes no arguments; called for its side effects.
  print('=============== MODEL: jj8 randomForest trainV2 12VAR as factors ===============')
  # NOTE: the banner text says 12VAR, but the formula active below is 8SVAR.

  # Shared predictor prep for the train and test frames: recode count/bin
  # columns as factors and add the combo factors. Columns are added by name
  # (not by positions 14..18) so the prep does not depend on column count.
  prep_features <- function(df) {
    factor_cols <- c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")
    df[factor_cols] <- lapply(df[factor_cols], as.factor)
    # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
    no_cabin <- ifelse(df[["cabincode"]] == 'X', 1, 0)
    df$sex_pclass <- as.factor(paste0(df[["sex"]], df[["pclass"]]))
    df$sex_agebin3 <- as.factor(paste0(df[["sex"]], df[["agebin3"]]))
    df$sex_farebin3 <- as.factor(paste0(df[["sex"]], df[["farebin3"]]))
    df$sex_cabinX <- as.factor(paste0(df[["sex"]], no_cabin))
    df$pclass_cabinX <- as.factor(paste0(df[["pclass"]], no_cabin))
    df
  }

  # ONLY meaningful while using regression instead of classification.
  cutoff <- .36

  # Load the trainV2 TRAINING data - DATA PREP done in Excel
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header=TRUE, sep=",")
  # `survived` is left numeric to get a regression response (thresholded to
  # 0/1 below). For classification, uncomment:
  # ti.train$survived <- as.factor(ti.train$survived)
  ti.train <- prep_features(ti.train)

  library(randomForest)
  # 12VAR version:
  # ti.rf <- randomForest(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3,
  #                       data = ti.train, ntree = 100)
  # 8SVAR version:
  ti.rf <- randomForest(survived ~ sibsp +pclass +cabincode +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX,
                        data = ti.train, ntree = 60)
  # plot(ti.rf)

  ti.train.pred <- ifelse(predict(ti.rf) < cutoff, 0, 1)
  ti.train.tab <- table(ti.train.pred, ti.train$survived, dnn=c("predicted","actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj8 starting TEST predictions ===============')
  # Load the testV2 TEST data - DATA PREP done in Excel
  ti.test <- read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv", header=TRUE, sep=",")
  ti.test <- prep_features(ti.test)

  print("test predictions being output to ti.test.pred...")
  # 12VAR version:
  # predictor_cols <- c("pclass", "sex", "sibsp", "parch", "embarked", "cabincode", "caibinlen5", "agebin3", "farebin3", "sex_pclass", "sex_agebin3", "sex_farebin3")
  # 8SVAR version:
  predictor_cols <- c("sibsp", "pclass", "cabincode", "sex_pclass", "sex_agebin3", "sex_farebin3", "sex_cabinX", "pclass_cabinX")
  ti.test.pred <- ifelse(predict(ti.rf, ti.test[, predictor_cols]) < cutoff, 0, 1)

  # Save predictions. write.csv() ignores col.names (with a warning) and would
  # still emit a header line, so write.table() is used to get the headerless
  # file that col.names=FALSE asked for.
  write.table(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj8B.csv",
              sep = ",", row.names=FALSE, col.names=FALSE, quote = FALSE)
}
#
# jj73E - rpart with 8SVAR COMBO features EVAL script - choose BEST cp value, then MAKE/use jj73 routine to create test predictions for SUBMIT
# ------- -------------------------
jj73E <- function() {
  # jj73E: holdout evaluation of rpart (8SVAR formula) over a grid of cp values.
  #
  # Reads trainV2.csv, turns the response and count/bin predictors into
  # factors, adds the combo factors, then for every seed in `seedvector` draws
  # a 1/3 holdout and fits one classification tree per entry of `cpvector` on
  # the remaining rows. Error and accuracy on both partitions are stored per
  # run in `ti.table`, averaged per cp into `ti.table.cpmeans`, printed, and
  # plotted. Takes no arguments; called for its console output and plots.
  #
  # accuracy w 1/3 holdout train data : train test nleaf
  # rpart 8SVAR as-factors minsplit = 10 BEST cp = 0.006 : 0.xxxxxxx 0.xxxxxxx 12+ 76077 << -- Kaggle public score as _jj73
  # * 9VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
  # * 12VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3
  # * 8SVAR = sibsp +pclass +cabincode +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX
  print('=============== MODEL: jj73E rpart trainV2 12VAR variations (xxxx)===============')
  # NOTE: the banner text says 12VAR, but the formula used below is 8SVAR.

  # Loaded here, like the randomForest/gbm routines load their own packages.
  library(rpart)

  # Load the trainV2 TRAINING data - DATA PREP done in Excel
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header=TRUE, sep=",")
  factor_cols <- c("survived", "sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")
  ti.train[factor_cols] <- lapply(ti.train[factor_cols], as.factor)

  # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
  # Columns are added by name (not by positions 14..18) so the prep does not
  # depend on the CSV having exactly 13 columns.
  no_cabin <- ifelse(ti.train[["cabincode"]] == 'X', 1, 0)
  ti.train$sex_pclass <- as.factor(paste0(ti.train[["sex"]], ti.train[["pclass"]]))
  ti.train$sex_agebin3 <- as.factor(paste0(ti.train[["sex"]], ti.train[["agebin3"]]))
  ti.train$sex_farebin3 <- as.factor(paste0(ti.train[["sex"]], ti.train[["farebin3"]]))
  ti.train$sex_cabinX <- as.factor(paste0(ti.train[["sex"]], no_cabin))
  ti.train$pclass_cabinX <- as.factor(paste0(ti.train[["pclass"]], no_cabin))

  # The seed vector serves as unique sample seed and also as batch number.
  seedvector <- 1:10
  # cpvector <- .015
  cpvector <- c(.0400, .0350, .0300, .0250, .0200, .0150, .0100, .0080, .0060, .0040)
  n_cp <- length(cpvector)

  # One row per (batch, cp) run, preallocated.
  result_cols <- c("batch", "cp", "trainerr", "testerr", "trainacc", "testacc")
  ti.table <- as.data.frame(matrix(NA_real_,
                                   nrow = length(seedvector) * n_cp,
                                   ncol = length(result_cols),
                                   dimnames = list(NULL, result_cols)))

  # 2x6 grid for the optional per-tree plots in the loop (commented out);
  # the caller's layout is restored when the function exits.
  old_par <- par(mfrow = c(2,6))
  on.exit(par(old_par), add = TRUE)

  for (s in seq_along(seedvector)) {
    # Reset sample seed, then slice new training and test sets (1/3 holdout).
    set.seed(seedvector[s])
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    for (i in seq_along(cpvector)) {
      ti.tree <- rpart(survived ~ sibsp +pclass +cabincode +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX,
                       method = "class",
                       data = ti.trainset,
                       cp = cpvector[i],
                       minsplit = 10)
      # if (i %in% c(10)) { plot(ti.tree); text(ti.tree) }

      ti.test.predictions <- predict(ti.tree, ti.testset, type = "class")
      ti.train.predictions <- predict(ti.tree, type = "class")
      ti.train.error <- sum(ti.train.predictions != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.predictions != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.predictions, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.predictions, ti.testset$survived))

      # Row stride follows length(cpvector) instead of a hard-coded 10, so the
      # grid can be edited without corrupting the table.
      ti.table[(s - 1) * n_cp + i, ] <- c(s, cpvector[i], ti.train.error, ti.test.error,
                                          ti.train.acc, ti.test.acc)
    }
  }

  # Sanity check: every batch should carry the same cp series. Explicit
  # print() is needed because bare expressions do not auto-print in a function.
  print(dim(ti.table))
  print(head(ti.table, 4))
  print(head(ti.table[ti.table$batch == 2, ], 4))

  # Average the train/test metrics by cp. `==` is safe here because the cp
  # values in the table were copied verbatim from cpvector.
  ti.table.cpmeans <- do.call(rbind, lapply(cpvector, function(cp_value) {
    runs <- ti.table[ti.table$cp == cp_value, ]
    data.frame(batch = 1, cp = cp_value,
               trainerr = mean(runs$trainerr), testerr = mean(runs$testerr),
               trainacc = mean(runs$trainacc), testacc = mean(runs$testacc))
  }))
  print(ti.table.cpmeans)

  par(mfrow=c(2,1))
  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainerr, pch=22, bg="blue", type="b",
       main="Avg Training vs Test Error by cp param", xlab="cp param", ylab="Error", ylim=c(.15, .21))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testerr, pch=22, bg="red", type="b")
  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainacc, pch=22, bg="blue", type="b",
       main="Avg Training vs Test Accuracy by cp param", xlab="cp param", ylab="Accuracy", ylim=c(.78, .86))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testacc, pch=22, bg="red", type="b")
  print('=============== END: jj73E ===============')
}
#
# jj73
# -------
jj73 <- function() {
  # jj73: fit the rpart classification tree (8SVAR formula) on all of
  # trainV2.csv and write test predictions for submission.
  #
  # Plots the tree, prints the training confusion table and accuracy, then
  # scores testV2.csv and writes one predicted class per line to
  # testpredictions_jj73.csv. Takes no arguments; called for its side effects.
  print('=============== MODEL: jj73 rpart trainV2 14VAR as factors ===============')
  # NOTE: the banner text says 14VAR, but the formula used below is 8SVAR.

  # Loaded here, like the randomForest/gbm routines load their own packages.
  library(rpart)

  # Shared predictor prep for the train and test frames: recode count/bin
  # columns as factors and add the combo factors. Columns are added by name
  # (not by positions 14..18) so the prep does not depend on column count.
  prep_features <- function(df) {
    factor_cols <- c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")
    df[factor_cols] <- lapply(df[factor_cols], as.factor)
    # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
    no_cabin <- ifelse(df[["cabincode"]] == 'X', 1, 0)
    df$sex_pclass <- as.factor(paste0(df[["sex"]], df[["pclass"]]))
    df$sex_agebin3 <- as.factor(paste0(df[["sex"]], df[["agebin3"]]))
    df$sex_farebin3 <- as.factor(paste0(df[["sex"]], df[["farebin3"]]))
    df$sex_cabinX <- as.factor(paste0(df[["sex"]], no_cabin))
    df$pclass_cabinX <- as.factor(paste0(df[["pclass"]], no_cabin))
    df
  }

  # Load the trainV2 TRAINING data - DATA PREP done in Excel
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header=TRUE, sep=",")
  ti.train$survived <- as.factor(ti.train$survived)
  # Adjusted variable "parch" from 9 to 6 in rows 344 and 367 of testV2.csv to normalize train/test factor vars
  ti.train <- prep_features(ti.train)

  ti.tree <- rpart(survived ~ sibsp +pclass +cabincode +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX,
                   method = "class",
                   data = ti.train,
                   cp = .004, ## .015, ## cp = .015, # cp = .01,
                   minsplit = 10)

  # Single-panel layout; restore the caller's layout when the function exits.
  old_par <- par(mfrow=c(1,1))
  on.exit(par(old_par), add = TRUE)
  # Drawn directly: wrapping these in print() only dumped their invisible
  # return values to the console.
  plot(ti.tree)
  text(ti.tree)

  ti.train.pred <- predict(ti.tree, type = "class")
  ti.train.tab <- table(ti.train.pred, ti.train$survived, dnn=c("predicted","actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj73 starting TEST predictions ===============')
  # Load the testV2 TEST data - DATA PREP done in Excel
  ti.test <- read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv", header=TRUE, sep=",")
  ti.test <- prep_features(ti.test)
  ti.test.pred <- predict(ti.tree, ti.test, type = "class")

  # Save predictions. write.csv() ignores col.names (with a warning) and would
  # still emit a header line, so write.table() is used to get the headerless
  # file that col.names=FALSE asked for.
  write.table(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj73.csv",
              sep = ",", row.names=FALSE, col.names=FALSE, quote = FALSE)
}
#
# jj72E - rpart with 14VAR COMBO features EVAL script - choose BEST cp value, then MAKE/use jj72 routine to create test predictions for SUBMIT
# ------- -------------------------
jj72E <- function() {
  # jj72E: holdout evaluation of rpart (14VAR formula) over a grid of cp values.
  #
  # Reads trainV2.csv, turns the response and count/bin predictors into
  # factors, adds the combo factors, then for every seed in `seedvector` draws
  # a 1/3 holdout and fits one classification tree per entry of `cpvector` on
  # the remaining rows. Error and accuracy on both partitions are stored per
  # run in `ti.table`, averaged per cp into `ti.table.cpmeans`, printed, and
  # plotted. Takes no arguments; called for its console output and plots.
  #
  # accuracy w 1/3 holdout train data : train test nleaf
  # rpart 14VAR as-factors minsplit = 15 BEST cp = 0.0068 : 0.xxxxxxx 0.xxxxxxx 8 0.78469 << -- Kaggle public score as _jj72B (tweaked to get 8 leaves)
  # rpart 14VAR as-factors minsplit = 20 BEST cp = 0.0200 : 0.8368687 0.8114478 4
  # * 9VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
  # * 12VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3
  print('=============== MODEL: jj72E rpart trainV2 12VAR variations (xxxx)===============')
  # NOTE: the banner text says 12VAR, but the formula used below has 14 terms.

  # Loaded here, like the randomForest/gbm routines load their own packages.
  library(rpart)

  # Load the trainV2 TRAINING data - DATA PREP done in Excel
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header=TRUE, sep=",")
  factor_cols <- c("survived", "sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")
  ti.train[factor_cols] <- lapply(ti.train[factor_cols], as.factor)

  # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
  # Columns are added by name (not by positions 14..18) so the prep does not
  # depend on the CSV having exactly 13 columns.
  no_cabin <- ifelse(ti.train[["cabincode"]] == 'X', 1, 0)
  ti.train$sex_pclass <- as.factor(paste0(ti.train[["sex"]], ti.train[["pclass"]]))
  ti.train$sex_agebin3 <- as.factor(paste0(ti.train[["sex"]], ti.train[["agebin3"]]))
  ti.train$sex_farebin3 <- as.factor(paste0(ti.train[["sex"]], ti.train[["farebin3"]]))
  ti.train$sex_cabinX <- as.factor(paste0(ti.train[["sex"]], no_cabin))
  ti.train$pclass_cabinX <- as.factor(paste0(ti.train[["pclass"]], no_cabin))

  # The seed vector serves as unique sample seed and also as batch number.
  seedvector <- 1:10
  # cpvector <- .0200
  cpvector <- c(.0400, .0350, .0300, .0250, .0200, .0150, .0100, .0080, .0060, .0040)
  n_cp <- length(cpvector)

  # One row per (batch, cp) run, preallocated.
  result_cols <- c("batch", "cp", "trainerr", "testerr", "trainacc", "testacc")
  ti.table <- as.data.frame(matrix(NA_real_,
                                   nrow = length(seedvector) * n_cp,
                                   ncol = length(result_cols),
                                   dimnames = list(NULL, result_cols)))

  for (s in seq_along(seedvector)) {
    # Reset sample seed, then slice new training and test sets (1/3 holdout).
    set.seed(seedvector[s])
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    for (i in seq_along(cpvector)) {
      ti.tree <- rpart(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
                       +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX +pclass_cabinX,
                       method = "class",
                       data = ti.trainset,
                       cp = cpvector[i],
                       minsplit = 20) ## 20 )
      # if (i %in% c(1,3,5,7)) { plot(ti.tree); text(ti.tree) }

      ti.test.predictions <- predict(ti.tree, ti.testset, type = "class")
      ti.train.predictions <- predict(ti.tree, type = "class")
      ti.train.error <- sum(ti.train.predictions != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.predictions != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.predictions, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.predictions, ti.testset$survived))

      # Row stride follows length(cpvector) instead of a hard-coded 10, so the
      # grid can be edited without corrupting the table.
      ti.table[(s - 1) * n_cp + i, ] <- c(s, cpvector[i], ti.train.error, ti.test.error,
                                          ti.train.acc, ti.test.acc)
    }
  }

  # Sanity check: every batch should carry the same cp series. Explicit
  # print() is needed because bare expressions do not auto-print in a function.
  print(dim(ti.table))
  print(head(ti.table, 4))
  print(head(ti.table[ti.table$batch == 2, ], 4))

  # Average the train/test metrics by cp. `==` is safe here because the cp
  # values in the table were copied verbatim from cpvector.
  ti.table.cpmeans <- do.call(rbind, lapply(cpvector, function(cp_value) {
    runs <- ti.table[ti.table$cp == cp_value, ]
    data.frame(batch = 1, cp = cp_value,
               trainerr = mean(runs$trainerr), testerr = mean(runs$testerr),
               trainacc = mean(runs$trainacc), testacc = mean(runs$testacc))
  }))
  print(ti.table.cpmeans)

  # Two stacked panels; restore the caller's layout when the function exits.
  old_par <- par(mfrow=c(2,1))
  on.exit(par(old_par), add = TRUE)
  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainerr, pch=22, bg="blue", type="b",
       main="Avg Training vs Test Error by cp param", xlab="cp param", ylab="Error", ylim=c(.15, .21))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testerr, pch=22, bg="red", type="b")
  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainacc, pch=22, bg="blue", type="b",
       main="Avg Training vs Test Accuracy by cp param", xlab="cp param", ylab="Accuracy", ylim=c(.78, .86))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testacc, pch=22, bg="red", type="b")
  print('=============== END: jj72E ===============')
}
#
# jj72
# -------
jj72 <- function() {
  # jj72: fit the rpart classification tree (14VAR formula) on all of
  # trainV2.csv and write test predictions for submission.
  #
  # Plots the tree, prints the training confusion table and accuracy, then
  # scores testV2.csv and writes one predicted class per line to
  # testpredictions_jj72C.csv. Takes no arguments; called for its side effects.
  print('=============== MODEL: jj72 rpart trainV2 14VAR as factors ===============')

  # Loaded here, like the randomForest/gbm routines load their own packages.
  library(rpart)

  # Shared predictor prep for the train and test frames: recode count/bin
  # columns as factors and add the combo factors. Columns are added by name
  # (not by positions 14..18) so the prep does not depend on column count.
  prep_features <- function(df) {
    factor_cols <- c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")
    df[factor_cols] <- lapply(df[factor_cols], as.factor)
    # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
    no_cabin <- ifelse(df[["cabincode"]] == 'X', 1, 0)
    df$sex_pclass <- as.factor(paste0(df[["sex"]], df[["pclass"]]))
    df$sex_agebin3 <- as.factor(paste0(df[["sex"]], df[["agebin3"]]))
    df$sex_farebin3 <- as.factor(paste0(df[["sex"]], df[["farebin3"]]))
    df$sex_cabinX <- as.factor(paste0(df[["sex"]], no_cabin))
    df$pclass_cabinX <- as.factor(paste0(df[["pclass"]], no_cabin))
    df
  }

  # Load the trainV2 TRAINING data - DATA PREP done in Excel
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header=TRUE, sep=",")
  ti.train$survived <- as.factor(ti.train$survived)
  # Adjusted variable "parch" from 9 to 6 in rows 344 and 367 of testV2.csv to normalize train/test factor vars
  ti.train <- prep_features(ti.train)

  ti.tree <- rpart(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
                   +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX +pclass_cabinX,
                   method = "class",
                   data = ti.train,
                   cp = .00580, ## cp = .015, # cp = .01,
                   minsplit = 20)

  # Single-panel layout; restore the caller's layout when the function exits.
  old_par <- par(mfrow=c(1,1))
  on.exit(par(old_par), add = TRUE)
  # Drawn directly: wrapping these in print() only dumped their invisible
  # return values to the console.
  plot(ti.tree)
  text(ti.tree)

  ti.train.pred <- predict(ti.tree, type = "class")
  ti.train.tab <- table(ti.train.pred, ti.train$survived, dnn=c("predicted","actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj72 starting TEST predictions ===============')
  # Load the testV2 TEST data - DATA PREP done in Excel
  ti.test <- read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv", header=TRUE, sep=",")
  ti.test <- prep_features(ti.test)
  ti.test.pred <- predict(ti.tree, ti.test, type = "class")

  # Save predictions. write.csv() ignores col.names (with a warning) and would
  # still emit a header line, so write.table() is used to get the headerless
  # file that col.names=FALSE asked for.
  write.table(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj72C.csv",
              sep = ",", row.names=FALSE, col.names=FALSE, quote = FALSE)
}
#
# jj71E - rpart with 12VAR COMBO features EVAL script - choose BEST cp value, then MAKE/use jj71 routine to create test predictions for SUBMIT
# ------- -------------------------
jj71E <- function() {
  # jj71E: holdout evaluation of rpart (12VAR formula) over the cp values in
  # `cpvector` (currently a single value; the full grid is commented out).
  #
  # Reads trainV2.csv, turns the response and count/bin predictors into
  # factors, adds the three sex_* combo factors, then for every seed in
  # `seedvector` draws a 1/3 holdout and fits one classification tree per
  # entry of `cpvector` on the remaining rows. Error and accuracy on both
  # partitions are stored per run in `ti.table`, averaged per cp into
  # `ti.table.cpmeans`, printed, and plotted.
  # Takes no arguments; called for its console output and plots.
  #
  # accuracy w 1/3 holdout train data : train test nleaf
  # rpart 12VAR as-factors minsplit = 20 BEST cp = 0.02 : 0.8350168 0.8202020 4
  # rpart 12VAR as-factors minsplit = 15 BEST cp = 0.015 : 0.8441077 0.8208754 4 0.79904 << -- Kaggle public score as _jj71 new BEST! 11/27
  # rpart 12VAR as-factors minsplit = 15 BEST cp = 0.01 : 0.8500000 0.8175084 8
  # rpart 12VAR as-factors minsplit = 15 BEST cp = 0.0068 : 0.8434343 0.8619529 8 0.78469 << -- Kaggle public score as _jj71A (tweaked to get 8 leaves)
  # * 9VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
  # * 12VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3
  print('=============== MODEL: jj71E rpart trainV2 12VAR variations (xxxx)===============')

  # Loaded here, like the randomForest/gbm routines load their own packages.
  library(rpart)

  # Load the trainV2 TRAINING data - DATA PREP done in Excel
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header=TRUE, sep=",")
  factor_cols <- c("survived", "sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")
  ti.train[factor_cols] <- lapply(ti.train[factor_cols], as.factor)

  # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
  # Columns are added by name (not by positions 14..16) so the prep does not
  # depend on the CSV having exactly 13 columns.
  ti.train$sex_pclass <- as.factor(paste0(ti.train[["sex"]], ti.train[["pclass"]]))
  ti.train$sex_agebin3 <- as.factor(paste0(ti.train[["sex"]], ti.train[["agebin3"]]))
  ti.train$sex_farebin3 <- as.factor(paste0(ti.train[["sex"]], ti.train[["farebin3"]]))

  # The seed vector serves as unique sample seed and also as batch number.
  seedvector <- 1:10
  cpvector <- .0200
  # cpvector <- c(.0400, .0350, .0300, .0250, .0200, .0150, .0100, .0080, .0060, .0040)
  n_cp <- length(cpvector)

  # One row per (batch, cp) run, preallocated.
  result_cols <- c("batch", "cp", "trainerr", "testerr", "trainacc", "testacc")
  ti.table <- as.data.frame(matrix(NA_real_,
                                   nrow = length(seedvector) * n_cp,
                                   ncol = length(result_cols),
                                   dimnames = list(NULL, result_cols)))

  for (s in seq_along(seedvector)) {
    # Reset sample seed, then slice new training and test sets (1/3 holdout).
    set.seed(seedvector[s])
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    for (i in seq_along(cpvector)) {
      ti.tree <- rpart(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3,
                       method = "class",
                       data = ti.trainset,
                       cp = cpvector[i],
                       minsplit = 20)
      # if (i %in% c(1,3,5,7)) { plot(ti.tree); text(ti.tree) }

      ti.test.predictions <- predict(ti.tree, ti.testset, type = "class")
      ti.train.predictions <- predict(ti.tree, type = "class")
      ti.train.error <- sum(ti.train.predictions != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.predictions != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.predictions, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.predictions, ti.testset$survived))

      # s serves as batch number in col 1; rows are laid out batch-major.
      ti.table[(s - 1) * n_cp + i, ] <- c(s, cpvector[i], ti.train.error, ti.test.error,
                                          ti.train.acc, ti.test.acc)
    }
  }

  # Sanity check: every batch should carry the same cp series. Explicit
  # print() is needed because bare expressions do not auto-print in a
  # function; the rows are selected by batch so the check stays in range when
  # cpvector holds a single value.
  print(dim(ti.table))
  print(head(ti.table, 4))
  print(head(ti.table[ti.table$batch == 2, ], 4))

  # Average the train/test metrics by cp. `==` is safe here because the cp
  # values in the table were copied verbatim from cpvector.
  ti.table.cpmeans <- do.call(rbind, lapply(cpvector, function(cp_value) {
    runs <- ti.table[ti.table$cp == cp_value, ]
    data.frame(batch = 1, cp = cp_value,
               trainerr = mean(runs$trainerr), testerr = mean(runs$testerr),
               trainacc = mean(runs$trainacc), testacc = mean(runs$testacc))
  }))
  print(ti.table.cpmeans)

  # Two stacked panels; restore the caller's layout when the function exits.
  old_par <- par(mfrow=c(2,1))
  on.exit(par(old_par), add = TRUE)
  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainerr, pch=22, bg="blue", type="b",
       main="Avg Training vs Test Error by cp param", xlab="cp param", ylab="Error", ylim=c(.15, .21))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testerr, pch=22, bg="red", type="b")
  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainacc, pch=22, bg="blue", type="b",
       main="Avg Training vs Test Accuracy by cp param", xlab="cp param", ylab="Accuracy", ylim=c(.78, .86))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testacc, pch=22, bg="red", type="b")
  print('=============== END: jj71E ===============')
}
#
# jj71
# -------
jj71 = function() {
  # Fit an rpart classification tree on trainV2.csv using nine prepared
  # columns plus three sex-combination factors, print the training confusion
  # matrix and accuracy, then write class predictions for testV2.csv to
  # testpredictions_jj71A.csv.
  # Takes no arguments; called for its printed output, plot and output file.
  print('=============== MODEL: jj71 rpart trainV2 12VAR as factors ===============')

  # Columns stored as numbers in the CSVs that the model treats as categories.
  categorical = c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")

  # Shared prep for the train and test frames: categorical conversion followed
  # by the combo variables, so both frames are always built the same way.
  prepare = function(d) {
    for (nm in categorical) {
      d[[nm]] = as.factor(d[[nm]])
    }
    # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
    # sex_pclass, sex_agebin3 and sex_farebin3 are appended by NAME (in that
    # order) instead of being written to fixed positions 14..16, so the prep
    # no longer depends on how many columns the CSV happens to contain.
    for (nm in c("pclass", "agebin3", "farebin3")) {
      d[[paste0("sex_", nm)]] = as.factor(paste0(d[["sex"]], d[[nm]]))
    }
    d
  }

  # Load the trainV2 TRAINING data -
  # DATA PREP done in Excel
  # stringsAsFactors = TRUE keeps text columns as factors, which was the
  # read.table default before R 4.0.
  ti.train = read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
                        header = TRUE, sep = ",", stringsAsFactors = TRUE)
  ti.train$survived = as.factor(ti.train$survived)
  ti.train = prepare(ti.train)
  # Adjusted variable "parch" from 9 to 6 in rows 344 and 367 of testV2.csv to normalize train/test factor vars

  ti.tree = rpart(survived ~ sex + sibsp + parch + pclass + embarked + cabincode +
                    caibinlen5 + agebin3 + farebin3 +
                    sex_pclass + sex_agebin3 + sex_farebin3,
                  method = "class",
                  data = ti.train,
                  cp = .0068, # cp = .01,
                  minsplit = 15)
  par(mfrow = c(1, 1))
  print(plot(ti.tree))
  print(text(ti.tree))

  # Training-set confusion matrix and accuracy
  ti.train.pred = predict(ti.tree, type = "class")
  ti.train.tab = table(ti.train.pred, ti.train$survived, dnn = c("predicted", "actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj71 starting TEST predictions ===============')
  # Load the testV2 TEST data -
  # DATA PREP done in Excel
  ti.test = read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv",
                       header = TRUE, sep = ",", stringsAsFactors = TRUE)
  ti.test = prepare(ti.test)
  ti.test.pred = predict(ti.tree, ti.test, type = "class")

  # Save predictions
  # write.csv() always writes a header row and ignores `col.names` (with a
  # warning), so that argument is dropped; the file contents are unchanged.
  # Switch to write.table(..., sep = ",", col.names = FALSE) if a headerless
  # file is required.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj71A.csv",
            row.names = FALSE, quote = FALSE)
}
#
# jj7E - rpart EVAL script - choose BEST cp value, then use jj7 routine to create test predictions for SUBMIT
# -------
jj7E = function() {
  # Evaluation script for the rpart model: for each of 10 seeded 2/3-1/3 splits
  # of trainV2.csv, fit one tree per cp value, record train/test error and
  # accuracy, average the results by cp, and plot the averaged curves.
  # Takes no arguments; called for its printed tables and plots.
  #
  # accuracy w 1/3 holdout train data : train test nleaf
  # rpart as-factors minsplit = 5 BEST cp = 0.004 : 0.8819 0.8242 23 0.76077 << -- Kaggle public score as _jj7B WAY OVERFIT!
  # rpart as-factors minsplit = 10 BEST cp = 0.006 : 0.8641 0.8222 13
  # rpart as-factors minsplit = 20 BEST cp = 0.02 : 0.8197 0.8118 4 0.78469 << -- Kaggle public score as _jj7
  # rpart as-factors minsplit = 20 BEST cp = 0.01 : 0.8398 0.8087 8
  # rpart as-factors minsplit = 30 BEST cp = 0.025 : 0.8156 0.8107 4
  # rpart as-numerics minsplit = 20 BEST cp = 0.01 : 0.8107 0.8061 8
  # rpart as-numerics minsplit = 10 BEST cp = 0.015 : 0.8476 0.8087 12 (not the BEST test acc, but narrower train-test gap)
  print('=============== MODEL: jj7E rpart trainV2 variations (xxxx)===============')

  # Load the trainV2 TRAINING data -
  # DATA PREP done in Excel
  # stringsAsFactors = TRUE keeps text columns as factors, which was the
  # read.table default before R 4.0.
  ti.train = read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
                        header = TRUE, sep = ",", stringsAsFactors = TRUE)
  for (nm in c("survived", "sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")) {
    ti.train[[nm]] = as.factor(ti.train[[nm]])
  }

  # Empty table to collect one row per (batch, cp) fit
  ti.table = data.frame(batch = numeric(), cp = numeric(),
                        trainerr = numeric(), testerr = numeric(),
                        trainacc = numeric(), testacc = numeric())

  # Make the seed vector to serve as unique sample seed and also as batch number 1-10
  seedvector = 1:10
  # cpvector = .01
  cpvector = c(.0400, .0350, .0300, .0250, .0200, .0150, .0100, .0080, .0060, .0040)
  n.cp = length(cpvector)

  # Iterate over seedvector and cpvector to train and test batches of trees
  for (s in seq_along(seedvector)) {
    # Reset sample seed
    set.seed(seedvector[s])
    # Slice new training and test sets with same ratio
    testids = sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset = ti.train[testids, ]
    ti.trainset = ti.train[-testids, ]

    # Iterate the cp values for this batch
    for (i in seq_along(cpvector)) {
      ti.tree = rpart(survived ~ sex + sibsp + parch + pclass + embarked + cabincode +
                        caibinlen5 + agebin3 + farebin3,
                      method = "class",
                      data = ti.trainset,
                      cp = cpvector[i],
                      minsplit = 20)
      # if (i %in% c(1,3,5,7)) { print(plot(ti.tree)); print(text(ti.tree)); }
      ti.test.predictions = predict(ti.tree, ti.testset, type = "class")
      ti.train.predictions = predict(ti.tree, type = "class")
      ti.train.error = sum(ti.train.predictions != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error = sum(ti.test.predictions != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc = accuracyFrom2x2ConfusionMatrix(table(ti.train.predictions, ti.trainset$survived))
      ti.test.acc = accuracyFrom2x2ConfusionMatrix(table(ti.test.predictions, ti.testset$survived))
      # Populate a row of the table... s now serves as batch number in col 1.
      # The row offset uses the actual number of cp values (not a literal 10),
      # so rows stay contiguous when cpvector is shortened or lengthened.
      ti.table[(s - 1) * n.cp + i, ] = c(s, cpvector[i], ti.train.error, ti.test.error,
                                         ti.train.acc, ti.test.acc)
    }
  }

  # Sanity check of ti table: one row per batch/cp pair. Values are printed
  # explicitly because bare expressions are not auto-printed inside a function.
  print(dim(ti.table))
  print(head(ti.table, 4))

  # Use existing table as template to create a table for the train and test errors avg by cp
  ti.table.cpmeans = ti.table[0, ]
  # Populate the means by cp (column 1 is a placeholder batch number of 1).
  # cp values in ti.table are verbatim copies of cpvector entries, so the exact
  # comparison below selects exactly the rows of that cp.
  for (i in seq_along(cpvector)) {
    cp.rows = ti.table$cp == cpvector[i]
    ti.table.cpmeans[i, ] = c(1, cpvector[i], unname(colMeans(ti.table[cp.rows, 3:6])))
  }
  # Sanity check – series look smoother
  print(ti.table.cpmeans)

  # Top panel: averaged error; bottom panel: averaged accuracy (blue = train, red = test)
  par(mfrow = c(2, 1))
  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainerr, pch = 22, bg = "blue", type = "b",
       main = "Avg Training vs Test Error by cp param", xlab = "cp param", ylab = "Error", ylim = c(.15, .21))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testerr, pch = 22, bg = "red", type = "b")
  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainacc, pch = 22, bg = "blue", type = "b",
       main = "Avg Training vs Test Accuracy by cp param", xlab = "cp param", ylab = "Accuracy", ylim = c(.78, .86))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testacc, pch = 22, bg = "red", type = "b")
  print('=============== END: jj7E ===============')
}
#
# jj7
# -------
jj7 = function() {
  # Fit an rpart classification tree (minsplit = 20, cp = 0.01) on trainV2.csv
  # with the nine prepared predictors, print the training confusion matrix and
  # accuracy, then write class predictions for testV2.csv to
  # testpredictions_jj7B.csv.
  # Takes no arguments; called for its printed output, plot and output file.
  print('=============== MODEL: jj7 rpart trainV2 as factors (8182)===============')

  # Columns stored as numbers in the CSVs that the model treats as categories.
  categorical = c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")

  # Load the trainV2 TRAINING data -
  # DATA PREP done in Excel
  # stringsAsFactors = TRUE keeps text columns as factors, which was the
  # read.table default before R 4.0.
  ti.train = read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
                        header = TRUE, sep = ",", stringsAsFactors = TRUE)
  ti.train$survived = as.factor(ti.train$survived)
  for (nm in categorical) {
    ti.train[[nm]] = as.factor(ti.train[[nm]])
  }
  # Adjusted variable "parch" from 9 to 6 in rows 344 and 367 of testV2.csv to normalize train/test factor vars

  ti.tree = rpart(survived ~ sex + sibsp + parch + pclass + embarked + cabincode +
                    caibinlen5 + agebin3 + farebin3,
                  method = "class", data = ti.train, minsplit = 20, cp = 0.01)
  par(mfrow = c(1, 1))
  print(plot(ti.tree))
  print(text(ti.tree))

  # Training-set confusion matrix and accuracy
  ti.train.pred = predict(ti.tree, type = "class")
  ti.train.tab = table(ti.train.pred, ti.train$survived, dnn = c("predicted", "actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj7 starting TEST predictions ===============')
  # Load the testV2 TEST data -
  # DATA PREP done in Excel
  ti.test = read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv",
                       header = TRUE, sep = ",", stringsAsFactors = TRUE)
  for (nm in categorical) {
    ti.test[[nm]] = as.factor(ti.test[[nm]])
  }
  ti.test.pred = predict(ti.tree, ti.test, type = "class")

  # Save predictions
  # write.csv() always writes a header row and ignores `col.names` (with a
  # warning), so that argument is dropped; the file contents are unchanged.
  # Switch to write.table(..., sep = ",", col.names = FALSE) if a headerless
  # file is required.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj7B.csv",
            row.names = FALSE, quote = FALSE)
}
#
# jj6
# -------
jj6 = function() {
  # Train a 5000-tree randomForest classifier on trainV2.csv with the nine
  # prepared predictors, print the training confusion matrix and accuracy,
  # then write predictions for testV2.csv to testpredictions_jj6.csv.
  # Takes no arguments; called for its printed output and output file.
  print('=============== MODEL: jj6 randomForest trainV2 ===============')

  # Load the trainV2 TRAINING data -
  # DATA PREP done in Excel
  # stringsAsFactors = TRUE keeps text columns (e.g. sex, embarked) as factors,
  # which was the read.table default before R 4.0.
  ti.train = read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
                        header = TRUE, sep = ",", stringsAsFactors = TRUE)

  # == Train and eval ==
  # first set response var to a factor so randomForest does classification instead of regression
  ti.train$survived = as.factor(ti.train$survived)
  library(randomForest)
  # NOTE(review): `method = "class"` is an rpart-style argument carried over
  # from the tree scripts; randomForest presumably ignores it - confirm before
  # removing.
  ti.rf = randomForest(survived ~ sex + sibsp + parch + pclass + embarked + cabincode +
                         caibinlen5 + agebin3 + farebin3,
                       method = "class",
                       data = ti.train,
                       ntree = 5000)
  # print(plot(ti.rf))

  # Training-set confusion matrix and accuracy
  ti.train.pred = predict(ti.rf)
  ti.train.tab = table(ti.train.pred, ti.train$survived, dnn = c("predicted", "actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj6 starting TEST predictions ===============')
  # Load the testV2 TEST data -
  # DATA PREP done in Excel
  ti.test = read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv",
                       header = TRUE, sep = ",", stringsAsFactors = TRUE)
  print("test predictions being output to ti.test.pred...")
  # Only the predictor columns used by the model are passed to predict()
  predictors = c("pclass", "sex", "sibsp", "parch", "embarked", "cabincode",
                 "caibinlen5", "agebin3", "farebin3")
  ti.test.pred = predict(ti.rf, ti.test[, predictors])

  # Save predictions
  # write.csv() always writes a header row and ignores `col.names` (with a
  # warning), so that argument is dropped; the file contents are unchanged.
  # Switch to write.table(..., sep = ",", col.names = FALSE) if a headerless
  # file is required.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj6.csv",
            row.names = FALSE, quote = FALSE)
}
#
# jj5
# -------
jj5 = function() {
  # Train a 500-tree randomForest classifier on train.csv using five original
  # columns (sex, sibsp, parch, pclass, embarked) and three derived ones
  # (cabinlisted, farebin, agebin), print the training confusion matrix and
  # accuracy, then write predictions for test.csv to testpredictions.csv.
  # Takes no arguments; called for its printed output, plot and output file.
  print('=============== MODEL: jj5 randomForest 5 orig + 3 derived ===============')

  # Drop the name column and append the three derived columns to a raw
  # train/test frame. The columns are computed for the whole frame at once,
  # which replaces the per-row loops and the zero-length `numeric()` column
  # initialisation they depended on.
  derive = function(raw) {
    # remove name col
    d = raw[, !colnames(raw) %in% "name"]
    # cabinlisted: 1 when a cabin is recorded, 0 when the field is empty
    d[, "cabinlisted"] = as.numeric(d$cabin != "")
    # agebin with binned values based on data inspection; missing age -> 50
    d[, "agebin"] = ifelse(is.na(d$age), 50,
                    ifelse(d$age <= 9, 9,
                    ifelse(d$age <= 24, 24,
                    ifelse(d$age <= 50, 50, 99))))
    # farebin with binned values based on data inspection; missing fare -> 2600
    d[, "farebin"] = ifelse(is.na(d$fare), 2600,
                     ifelse(d$fare <= 7.75, 775,
                     ifelse(d$fare <= 26.0, 2600, 9900)))
    d
  }

  # Load the TRAINING data
  # stringsAsFactors = TRUE keeps text columns (e.g. sex, embarked) as factors,
  # which was the read.table default before R 4.0.
  ti.train = read.table("file://C:/Users/Joe/My Documents/Titanic/train.csv",
                        header = TRUE, sep = ",", stringsAsFactors = TRUE)
  # assign missing embarked values to most common factor
  # ti.train2[ti.train2$embarked=="", 10] = "S"
  ti.train2 = derive(ti.train)

  # == Train and eval ==
  # first set response var to a factor so randomForest does classification instead of regression
  ti.train2$survived = as.factor(ti.train2$survived)
  library(randomForest)
  # NOTE(review): `method = "class"` is an rpart-style argument carried over
  # from the tree scripts; randomForest presumably ignores it - confirm before
  # removing.
  ti.rf = randomForest(survived ~ sex + sibsp + parch + pclass + embarked +
                         cabinlisted + farebin + agebin,
                       method = "class",
                       data = ti.train2, ntree = 500)
  print(plot(ti.rf))

  # Training-set confusion matrix and accuracy
  ti.train.pred = predict(ti.rf)
  ti.train.tab = table(ti.train.pred, ti.train$survived, dnn = c("predicted", "actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj5 starting TEST predictions ===============')
  # Load the TEST data
  ti.test = read.table("file://C:/Users/Joe/My Documents/Titanic/test.csv",
                       header = TRUE, sep = ",", stringsAsFactors = TRUE)
  # assign missing embarked values to most common factor
  # ti.test2[ti.test2$embarked=="", 9] = "S"
  ti.test2 = derive(ti.test)

  print("test predictions being output to ti.test.pred...")
  # Only the predictor columns used by the model are passed to predict()
  predictors = c("pclass", "sex", "sibsp", "parch", "embarked",
                 "cabinlisted", "farebin", "agebin")
  ti.test.pred = predict(ti.rf, ti.test2[, predictors])

  # Save predictions
  # write.csv() always writes a header row and ignores `col.names` (with a
  # warning), so that argument is dropped; the file contents are unchanged.
  # Switch to write.table(..., sep = ",", col.names = FALSE) if a headerless
  # file is required.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions.csv",
            row.names = FALSE, quote = FALSE)
}
#
# jj4
# -------
jj4 = function() {
  # Fit an rpart classification tree on train.csv using five original columns
  # (sex, sibsp, parch, pclass, embarked) and three derived ones (cabinlisted,
  # farebin, agebin), draw the tree, and print the training confusion matrix
  # and accuracy.
  # Takes no arguments; called for its printed output and plot.
  print('=============== MODEL: jj4 rpart 5 orig + 3 derived ===============')

  # Load the data
  # stringsAsFactors = TRUE keeps text columns (e.g. sex, embarked) as factors,
  # which was the read.table default before R 4.0.
  ti.train = read.table("file://C:/Users/Joe/My Documents/Titanic/train.csv",
                        header = TRUE, sep = ",", stringsAsFactors = TRUE)
  # remove name col
  ti.train2 = ti.train[, !colnames(ti.train) %in% "name"]

  # Derived columns are computed for the whole frame at once, which replaces
  # the per-row loops and the zero-length `numeric()` column initialisation
  # they depended on.
  # cabinlisted: 1 when a cabin is recorded, 0 when the field is empty
  ti.train2[, "cabinlisted"] = as.numeric(ti.train2$cabin != "")
  # agebin with binned values based on data inspection; missing age -> 50
  age = ti.train2$age
  ti.train2[, "agebin"] = ifelse(is.na(age), 50,
                          ifelse(age <= 9, 9,
                          ifelse(age <= 24, 24,
                          ifelse(age <= 50, 50, 99))))
  # farebin with binned values based on data inspection; missing fare -> 2600
  fare = ti.train2$fare
  ti.train2[, "farebin"] = ifelse(is.na(fare), 2600,
                           ifelse(fare <= 7.75, 775,
                           ifelse(fare <= 26.0, 2600, 9900)))

  # Train and eval
  ti.tree = rpart(survived ~ sex + sibsp + parch + pclass + embarked +
                    cabinlisted + farebin + agebin,
                  method = "class", data = ti.train2)
  print(plot(ti.tree))
  print(text(ti.tree))
  ti.train.pred = predict(ti.tree, type = "class")
  ti.train.tab = table(ti.train.pred, ti.train$survived, dnn = c("predicted", "actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))
}
#
# jj3
# -------
jj3 = function() {
  # Fits an rpart classification tree of survived on sex, sibsp and parch
  # (no tuning arguments passed), draws it, and prints the training-set
  # confusion matrix followed by its accuracy.
  print('=============== MODEL: jj3 rpart sex+sibsp+parch ===============')

  # Read the training file
  train.path <- "file://C:/Users/Joe/My Documents/Titanic/train.csv"
  passengers <- read.table(train.path, header = TRUE, sep = ",")

  fit <- rpart(survived ~ sex + sibsp + parch, data = passengers, method = "class")

  # Draw the tree and its labels; the values returned by plot()/text() are
  # printed as well.
  plot.result <- plot(fit)
  print(plot.result)
  text.result <- text(fit)
  print(text.result)

  # Predicted vs actual classes on the training rows
  fitted.class <- predict(fit, type = "class")
  confusion <- table(fitted.class, passengers$survived, dnn = c("predicted", "actual"))
  print(confusion)
  train.accuracy <- accuracyFrom2x2ConfusionMatrix(confusion)
  print(train.accuracy)
}
#
# jj2
# -------
jj2 = function() {
  # Fits an rpart classification tree of survived on sex and sibsp
  # (no tuning arguments passed), draws it, and prints the training-set
  # confusion matrix followed by its accuracy.
  print('=============== MODEL: jj2 rpart sex+sibsp===============')

  # Read the training file
  train.path <- "file://C:/Users/Joe/My Documents/Titanic/train.csv"
  passengers <- read.table(train.path, header = TRUE, sep = ",")

  fit <- rpart(survived ~ sex + sibsp, data = passengers, method = "class")

  # Draw the tree and its labels; the values returned by plot()/text() are
  # printed as well.
  plot.result <- plot(fit)
  print(plot.result)
  text.result <- text(fit)
  print(text.result)

  # Predicted vs actual classes on the training rows
  fitted.class <- predict(fit, type = "class")
  confusion <- table(fitted.class, passengers$survived, dnn = c("predicted", "actual"))
  print(confusion)
  train.accuracy <- accuracyFrom2x2ConfusionMatrix(confusion)
  print(train.accuracy)
}
#
# jj1
# -------
jj1 = function() {
  # Fits an rpart classification tree of survived on sex and pclass
  # (no tuning arguments passed) and prints the training-set confusion
  # matrix followed by its accuracy. No plot is drawn.
  print('=============== MODEL: jj1 rpart sex+pclass ===============')

  # Read the training file
  train.path <- "file://C:/Users/Joe/My Documents/Titanic/train.csv"
  passengers <- read.table(train.path, header = TRUE, sep = ",")

  fit <- rpart(survived ~ sex + pclass, data = passengers, method = "class")

  # Predicted vs actual classes on the training rows
  fitted.class <- predict(fit, type = "class")
  confusion <- table(fitted.class, passengers$survived, dnn = c("predicted", "actual"))
  print(confusion)
  train.accuracy <- accuracyFrom2x2ConfusionMatrix(confusion)
  print(train.accuracy)
}
#
# accuracyFrom2x2ConfusionMatrix
# -------------------------------
accuracyFrom2x2ConfusionMatrix = function(tb) {
  # Accuracy of a 2x2 confusion matrix: the two diagonal cells divided by the
  # total of all cells.
  #
  # tb: a two-dimensional table/matrix with exactly 2 rows and 2 columns.
  # Returns the accuracy as a number, or -1 when `tb` is not 2x2.
  shape <- dim(tb)
  # Guard: anything that is not exactly two-dimensional and 2x2 is rejected
  if (length(shape) != 2 || shape[1] != 2 || shape[2] != 2) {
    return(-1)
  }
  # Diagonal cells are the cases where row and column class agree
  agreeing <- tb[1, 1] + tb[2, 2]
  agreeing / sum(tb)
}