Exploration and Prediction Models

Kaggle Titanic Survival
Exploration and Prediction Models

Following is a page snapshot during the data mining competition hosted by kaggle.com for my STATS202 course at Stanford. My explorations and models in R follow.
This was my first Kaggle contest, in 2012, during my first graduate course in Data Mining. I learned that you can easily overfit the Kaggle public leaderboard! I felt accomplished after breaking into the Top 10, but I was fooling myself. I knew how to avoid overfitting a test set when tuning a model, but I hadn't yet been exposed to the bias-variance tradeoff as a way of thinking about models.
Kaggle keeps a private set of holdout data, and prediction accuracy against that holdout determines the final rankings. I ended up tweaking the bias down to raise my public score, at the cost of pushing the variance up, which showed once the model was scored against the unseen private holdout data. This newbie trap is explained well here.
Kaggle Titanic Leaderboard

# TEST data for generating predictions to submit: 
#
# ti.test = read.table("file://C:/Users/Joe/My Documents/Titanic/test.csv", header=TRUE, sep=",")



# IDEAS  -- TRY GLM and SVM

# IDEAS  -- TRY Family name groupings!



#
# jj11E - svm EVAL script - choose BEST param values, then use jj11 routine to create test predictions for SUBMIT
# -------
jj11E = function() {

  # Evaluation harness for the svm models.
  #
  # For each seed in seedvector a random third of trainV2.csv is held out, an svm is
  # fitted on the remaining rows once per value in gamvector, and the train/test error
  # and accuracy are recorded. The per-gamma averages over all seeds are then printed
  # and plotted.
  #
  # Takes no arguments. Side effects: reads trainV2.csv, prints the result tables and
  # draws two plots. Relies on accuracyFrom2x2ConfusionMatrix(), which is defined
  # outside this function.

  # accuracy w 1/3 holdout train data            :       train     test
  # --------------------------------------------         --------- ---------

  # svm 8SVAR as-factors  radial gamma=.09 tolerance=df : 0.8361953 0.8171717

  # svm 14VAR as-factors  radial gamma=.07 tolerance=df : 0.8378788 0.8259259 0.77990 << -- Kaggle public score as _jj11H matches _jj11F

  # svm 14VAR as-factors  radial gamma=.1 tolerance=.3  : 0.8552189 0.8289562
  # svm 14VAR as-factors  radial gamma=.1 tolerance=df  : 0.8538721 0.8259259
  # svm 14VAR as-factors  radial gamma=.06 tolerance=df : 0.8350168 0.8235690
  # svm 14VAR as-factors  radial gamma=.1 tolerance=df  : 0.xxxxxxx 0.xxxxxxx 0.77990 << -- Kaggle public score as _jj11F (BAD ver of 14VAR)

  # svm 12VAR as-factors  radial gamma=.2 tolerance=.4  : 0.8760943 0.8282828
  # svm 12VAR as-factors  radial gamma=.2 tolerance=df  : 0.8771044 0.8276094
  # svm 12VAR as-factors  radial gamma=.1 tolerance=df  : 0.8484848 0.8323232 0.78469 << -- Kaggle public score as _jj11C

  # svm 12VAR as-factors  polynomial gamma=.1           : 0.8664983 0.8255892
  # svm 12VAR as-factors  polynomial gamma=.1 degree=2  : 0.8505051 0.8313131 0.77512 << -- Kaggle public score as _jj11D

  # svm 9VAR  as-factors  linear gamma=.1               : 0.8185185 0.8026936 NOTE: gamma not used by linear kernel

  # svm 9VAR  as-factors  polynomial gamma=.2           : 0.8816498 0.8171717
  # svm 9VAR  as-factors  polynomial gamma=.2 degree=2  : 0.8631313 0.8202020 0.77990 << -- Kaggle public score as _jj11A

  # svm 9VAR  as-factors  radial gamma=.2 tolerance=.7  : 0.8595960 0.8249158 0.79426 << -- Kaggle public score as _jj11B
  # svm 9VAR  as-factors  radial gamma=.2               : 0.8580808 0.8225589

  # svm 9VAR  as-numerics linear gamma=.1               : 0.7888889 0.783165
  # svm 9VAR  as-numerics linear scale=F gamma=.1       : 0.7883838 0.7835017

  # * 9VAR  = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
  # * 12VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3
  # * 14VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX
  # * 8SVAR = sibsp +pclass +cabincode +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX
  #           (8 vars selected BASED on variable influence calculated by gbm)


  print('=============== MODEL: jj11E svm trainV2 variations (xxxx)===============')

  # Load the trainV2 TRAINING data - DATA PREP done in Excel.
  # stringsAsFactors = TRUE is spelled out because the read.table default changed to
  # FALSE in R 4.0; this keeps text columns as factors, as they were when the
  # results above were recorded.
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
                         header = TRUE, sep = ",", stringsAsFactors = TRUE)

  # Treat the response and the coded predictors as categorical
  for (v in c("survived", "sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")) {
    ti.train[[v]] <- as.factor(ti.train[[v]])
  }

  # -------- PROPOSED Secret Sauce primer: ADD combo variables to get from 9VAR to 12VAR/14VAR -----------
  # Columns are added by name rather than at fixed positions 14..18, so the code no
  # longer depends on the CSV having exactly 13 columns.
  sex <- ti.train[["sex"]]
  pclass <- ti.train[["pclass"]]
  cabin.is.x <- ifelse(ti.train[["cabincode"]] == "X", 1, 0)
  ti.train[["sex_pclass"]] <- as.factor(paste0(sex, pclass))
  ti.train[["sex_agebin3"]] <- as.factor(paste0(sex, ti.train[["agebin3"]]))
  ti.train[["sex_farebin3"]] <- as.factor(paste0(sex, ti.train[["farebin3"]]))
  ti.train[["sex_cabinX"]] <- as.factor(paste0(sex, cabin.is.x))
  ti.train[["pclass_cabinX"]] <- as.factor(paste0(pclass, cabin.is.x))

  # The seed vector serves as unique sample seed and also as batch number 1-10
  seedvector <- 1:10

  library(e1071)

  #### gamvector = c(.0004, .0006, .0008, .001, .0012, .0014, .0016, .002, .0025, .003)  ##  c(.1, .2)
  gamvector <- c(.03, .04, .05, .06, .07, .08, .09, .1, .12, .2)
  #### gamvector = c(.1, .3, .5, .7, .9, 1.2, 1.5, 2.0, 3.0, 4.0)

  # Predictor columns handed to predict() for the held-out rows (8SVAR version).
  # 9VAR version:  "sex", "sibsp", "parch", "pclass", "embarked", "cabincode",
  #                "caibinlen5", "agebin3", "farebin3"
  # 14VAR version: the 9VAR columns plus "sex_pclass", "sex_agebin3", "sex_farebin3",
  #                "sex_cabinX", "pclass_cabinX"
  predictor.cols <- c("sibsp", "pclass", "cabincode", "sex_pclass", "sex_agebin3",
                      "sex_farebin3", "sex_cabinX", "pclass_cabinX")

  # One preallocated row per (seed, gamma) pair
  n.gam <- length(gamvector)
  col.names <- c("batch", "gam", "trainerr", "testerr", "trainacc", "testacc")
  results <- matrix(NA_real_, nrow = length(seedvector) * n.gam, ncol = length(col.names),
                    dimnames = list(NULL, col.names))

  # Iterate over seedvector and gamvector to train and test batches of models
  for (s in seq_along(seedvector)) {
    # Reset sample seed, then slice new training and test sets with the same ratio
    set.seed(seedvector[s])
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    for (i in seq_along(gamvector)) {

      # 9VAR version:
      # ti.svm = svm(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3,

      # 14VAR version:
      # ti.svm = svm(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
      #                         +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX +pclass_cabinX,

      # 8SVAR version:
      ti.svm <- svm(survived ~ sibsp +pclass +cabincode +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX,
        data = ti.trainset,
        kernel = "radial",        ## "polynomial", ## "sigmoid", ## "linear", ##
        gamma = gamvector[i])     ## .1) ## .1,
        # degree= gamvector[i])
        # tolerance= gamvector[i])

      # predict() without newdata returns the fitted values for the training rows
      ti.train.predictions <- predict(ti.svm)
      ti.test.predictions <- predict(ti.svm, ti.testset[, predictor.cols])

      ti.train.error <- sum(ti.train.predictions != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.predictions != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.predictions, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.predictions, ti.testset$survived))

      # s serves as batch number in col 1
      results[(s - 1) * n.gam + i, ] <-
        c(s, gamvector[i], ti.train.error, ti.test.error, ti.train.acc, ti.test.acc)
    }
  }
  ti.table <- as.data.frame(results)

  # Sanity check of ti table - one row per seed/gamma pair.
  # Explicit print(): bare expressions are not auto-printed inside a function.
  print(dim(ti.table))
  print(head(ti.table, 10))

  # Average the train and test errors/accuracies by gamma (batch column fixed at 1)
  ti.table.gammeans <- do.call(rbind, lapply(gamvector, function(g) {
    hit <- ti.table[["gam"]] == g
    data.frame(batch = 1, gam = g,
               trainerr = mean(ti.table[["trainerr"]][hit]),
               testerr = mean(ti.table[["testerr"]][hit]),
               trainacc = mean(ti.table[["trainacc"]][hit]),
               testacc = mean(ti.table[["testacc"]][hit]))
  }))

  # Sanity check - series look smoother
  print(ti.table.gammeans)

  # Two stacked panels; the caller's graphics settings are restored on exit
  old.par <- par(mfrow = c(2, 1))
  on.exit(par(old.par), add = TRUE)

  plot(ti.table.gammeans$gam, ti.table.gammeans$trainerr, pch=22, bg="blue", type="b",
    main="Avg Training vs Test Error by gam param", xlab="gam param", ylab="Error", ylim=c(.10, .21))
  points(ti.table.gammeans$gam, ti.table.gammeans$testerr, pch=22, bg="red", type="b")

  plot(ti.table.gammeans$gam, ti.table.gammeans$trainacc, pch=22, bg="blue", type="b",
    main="Avg Training vs Test Accuracy by gam param", xlab="gam param", ylab="Accuracy", ylim=c(.78, .90))
  points(ti.table.gammeans$gam, ti.table.gammeans$testacc, pch=22, bg="red", type="b")

  print('=============== DONE: function jj11E =================')

}





#
# jj11
# -------
jj11 = function() {

  # Fits the 14VAR radial svm on the full trainV2.csv, prints the training confusion
  # matrix and accuracy, then predicts testV2.csv and writes the predictions to
  # testpredictions_jj11H.csv.
  #
  # Takes no arguments. Relies on accuracyFrom2x2ConfusionMatrix(), which is defined
  # outside this function.
  #
  # SEE function jj11E for SUBMITTED test predictions using svm...

  print('=============== MODEL: jj11 svm trainV2 as factors ===============')

  # Shared preparation for the train and test frames: coded predictors become
  # factors and the combo variables are added.
  # -------- PROPOSED Secret Sauce primer: ADD combo variables to get from 9VAR to 12VAR/14VAR -----------
  # Columns are added by name rather than at fixed positions 14..18, so the code no
  # longer depends on the CSV having exactly 13 columns.
  prepare.predictors <- function(df) {
    for (v in c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")) {
      df[[v]] <- as.factor(df[[v]])
    }
    sex <- df[["sex"]]
    pclass <- df[["pclass"]]
    cabin.is.x <- ifelse(df[["cabincode"]] == "X", 1, 0)
    df[["sex_pclass"]] <- as.factor(paste0(sex, pclass))
    df[["sex_agebin3"]] <- as.factor(paste0(sex, df[["agebin3"]]))
    df[["sex_farebin3"]] <- as.factor(paste0(sex, df[["farebin3"]]))
    df[["sex_cabinX"]] <- as.factor(paste0(sex, cabin.is.x))
    df[["pclass_cabinX"]] <- as.factor(paste0(pclass, cabin.is.x))
    df
  }

  # Load the trainV2 TRAINING data - DATA PREP done in Excel.
  # stringsAsFactors = TRUE is spelled out because the read.table default changed to
  # FALSE in R 4.0.
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
                         header = TRUE, sep = ",", stringsAsFactors = TRUE)
  ti.train[["survived"]] <- as.factor(ti.train[["survived"]])
  ti.train <- prepare.predictors(ti.train)

  library(e1071)

  # 9VAR version:
  # ti.svm = svm(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3,

  # 14VAR version:
  ti.svm <- svm(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
                           +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX +pclass_cabinX,
    data = ti.train,
    kernel = "radial",  ## "polynomial", ##
    gamma = 0.07)

    ## degree= 2)
    ## tolerance= 0.3)

  # predict() without newdata returns the fitted values for the training rows
  ti.train.pred <- predict(ti.svm)
  ti.train.tab <- table(ti.train.pred, ti.train$survived, dnn=c("predicted","actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj11 starting TEST predictions ===============')

  # Load the testV2 TEST data - DATA PREP done in Excel
  ti.test <- read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv",
                        header = TRUE, sep = ",", stringsAsFactors = TRUE)
  ti.test <- prepare.predictors(ti.test)

  print("test predictions being output to ti.test.pred...")

  # 9VAR version would pass only the first nine names below.
  # 14VAR version:
  predictor.cols <- c("sex", "sibsp", "parch", "pclass", "embarked", "cabincode",
                      "caibinlen5", "agebin3", "farebin3",
                      "sex_pclass", "sex_agebin3", "sex_farebin3",
                      "sex_cabinX", "pclass_cabinX")
  ti.test.pred <- predict(ti.svm, ti.test[, predictor.cols])

  # Save predictions.
  # The previous col.names=FALSE argument was dropped: write.csv ignores it (with a
  # warning) and always writes a header row, so the file contents are unchanged.
  # Use write.table(..., sep = ",", col.names = FALSE) if a headerless file is needed.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj11H.csv",
    row.names=FALSE, quote = FALSE)

}






#
# jj91E - 14VAR gbm EVAL script - choose BEST params, then use jj91 routine to create test predictions for SUBMIT
# -------
jj91E = function() {

  # Evaluation harness for the 14VAR gbm models.
  #
  # For each seed in seedvector a random third of trainV2.csv is held out, a gbm is
  # fitted on the remaining rows once per value in nvector (currently used as the
  # shrinkage), and the train/test error and accuracy are recorded. The per-value
  # averages over all seeds are then printed and plotted.
  #
  # Takes no arguments. Side effects: reads trainV2.csv, prints the result tables and
  # draws the gbm.perf plots plus two summary plots. Relies on
  # accuracyFrom2x2ConfusionMatrix(), which is defined outside this function.

  # accuracy w 1/3 holdout train data 14VAR                                                      : train-avg test-avg
  # ---------------------------------------------------------------------------------------------  --------- ---------

  # gaussian n.tree= 100  (0->1 cutoff .40)  shrinkage=.001 bag.fraction=dflt interaction.depth=7  0.8466330 0.8242424 DEC 4: TRY gaussian, test cutoff pct...

  # gaussian n.tree= 500  (0->1 cutoff .40)  shrinkage=.001 bag.fraction=dflt interaction.depth=4  0.8506734 0.8249158
  # gaussian n.tree= 500  (0->1 cutoff .40)  shrinkage=.001 bag.fraction=dflt interaction.depth=2  0.8109428 0.8087542 <-- diff .002

  # bernoulli n.tree= 100 (0->1 cutoff -.40) shrinkage=.001 bag.fraction=dflt interaction.depth=4  0.8338384 0.8228956 <-- diff .011
  # bernoulli n.tree= 500 (0->1 cutoff -.40) shrinkage=.001 bag.fraction=dflt interaction.depth=2  0.8144781 0.8111111 <-- diff .003
  # bernoulli n.tree= 500 (0->1 cutoff -.40) shrinkage=.001 bag.fraction=dflt interaction.depth=4  0.8488215 0.8242424

  # bernoulli n.tree= 500 (0->1 cutoff -.40) shrinkage=.006 bag.fraction=dflt interaction.depth=2  0.8434343 0.8245791

  # bernoulli n.tree=2000 (0->1 cutoff .45)  shrinkage=.008 bag.fraction=dflt interaction.depth=1  0.8309764 0.8272727 <-- diff .004 0.76555 Kaggle public score as _jj91B
  # bernoulli n.tree= 500 (0->1 cutoff .45)  shrinkage=.012 bag.fraction=dflt interaction.depth=2  0.8365320 0.8279461 <-- diff .008 0.xxxxx _jj91C only 1DIFF from _jj91B

  # bernoulli n.tree= 500 (0->1 cutoff -.36) shrinkage=.001 bag.fraction=dflt interaction.depth=2  0.8190236 0.8134680 <-- diff .005 0.78469 Kaggle public score as _jj91A
  # bernoulli n.tree= 500 (0->1 cutoff -.36) shrinkage=.001 bag.fraction=dflt interaction.depth=3  0.8380471 0.8232323

  # bernoulli n.tree=2000 (0->1 cutoff -.36) shrinkage=.001 bag.fraction=dflt interaction.depth=3  0.8356902 0.8158249 <-- diff .019

  # bernoulli n.tree=8000 (0->1 cutoff -.36) shrinkage=.001 bag.fraction=dflt interaction.depth=1  0.8439394 0.8309764 <-- diff .013 0.77990 Kaggle public score as _jj91
  # bernoulli n.tree=8000 (0->1 cutoff -.36) shrinkage=.001 bag.fraction=dflt interaction.depth=2  0.8550505 0.8313131



  print('=============== MODEL: jj91E gbm trainV2 variations (xxxx)===============')

  # Load the trainV2 TRAINING data - DATA PREP done in Excel.
  # stringsAsFactors = TRUE is spelled out because the read.table default changed to
  # FALSE in R 4.0, and gbm does not accept character predictors.
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
                         header = TRUE, sep = ",", stringsAsFactors = TRUE)

  # All other vars are as-read, NO use of as.factor(...) on the coded predictors

  # -------- PROPOSED Secret Sauce primer: ADD combo variables to get from 9VAR to 12VAR/14VAR -----------
  # Columns are added by name rather than at fixed positions 14..18, so the code no
  # longer depends on the CSV having exactly 13 columns.
  sex <- ti.train[["sex"]]
  pclass <- ti.train[["pclass"]]
  cabin.is.x <- ifelse(ti.train[["cabincode"]] == "X", 1, 0)
  ti.train[["sex_pclass"]] <- as.factor(paste0(sex, pclass))
  ti.train[["sex_agebin3"]] <- as.factor(paste0(sex, ti.train[["agebin3"]]))
  ti.train[["sex_farebin3"]] <- as.factor(paste0(sex, ti.train[["farebin3"]]))
  ti.train[["sex_cabinX"]] <- as.factor(paste0(sex, cabin.is.x))
  ti.train[["pclass_cabinX"]] <- as.factor(paste0(pclass, cabin.is.x))

  # The seed vector serves as unique sample seed and also as batch number 1-10
  seedvector <- 1:10 ## 1:5 ##

  library(gbm)

  # gbm.perf draws one plot per fit into a 2x5 grid; the caller's graphics settings
  # are restored on exit
  old.par <- par(mfrow = c(2, 5))
  on.exit(par(old.par), add = TRUE)

  ### nvector = 1:4     ## 1:10
  ### nvector = 8 ## c(.0400, .0350, .0300, .0250, .0200, .0150, .0100, .0080, .0060, .0040)
  ### nvector = c(.006, .008, .01, .012, .014)
  nvector <- c(.008, .01, .012, .014, .016, .018, .02, .022, .024)
  ### nvector = c(100, 300, 400, 500, 600, 700, 800, 1000)
  ### nvector = c(.2, .3, .35, .4, .45, .5, .6)

  # Name of the gbm argument that nvector currently feeds; used for the table output
  # and plot labels (these previously said "idepth" although shrinkage is swept).
  # Update this together with the gbm() call when sweeping a different parameter.
  swept.param <- "shrinkage"

  # One preallocated row per (seed, parameter value) pair
  n.val <- length(nvector)
  col.names <- c("batch", "param", "trainerr", "testerr", "trainacc", "testacc")
  results <- matrix(NA_real_, nrow = length(seedvector) * n.val, ncol = length(col.names),
                    dimnames = list(NULL, col.names))

  # Iterate over seedvector and nvector to train and test batches of trees
  for (s in seq_along(seedvector)) {
    # Reset sample seed, then slice new training and test sets with the same ratio
    set.seed(seedvector[s])
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    # Iterate the swept parameter values for this batch
    for (i in seq_along(nvector)) {

      ti.gbm <- gbm(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
                               +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX +pclass_cabinX,
        data = ti.trainset,
        distribution = "bernoulli",
        n.trees = 2000,
        shrinkage = nvector[i],
        interaction.depth = 1,
        cv.folds = 5)

      # ---------- Perf evals from R docs --------------
      # check performance using 5-fold cross-validation
      best.iter <- gbm.perf(ti.gbm, method="cv")
      print(best.iter)
      # ---------- Perf evals from docs --------------

      ti.train.pred <- predict(ti.gbm, ti.trainset, n.trees = best.iter)
      ti.test.pred <- predict(ti.gbm, ti.testset, n.trees = best.iter)

      # NOTE(review): predict() is called without a type argument, so the cutoff is
      # applied on whatever scale gbm returns by default - confirm that 0.45 is the
      # intended cutoff for the bernoulli fit (the log above also tried -.36/-.40).

      # ti.train.pred01 = ifelse(ti.train.pred < 0.5, 0, 1)
      # ti.test.pred01 = ifelse(ti.test.pred < 0.5, 0, 1)

      # use for gaussian?
      ti.train.pred01 <- ifelse(ti.train.pred < 0.45, 0, 1)
      ti.test.pred01 <- ifelse(ti.test.pred < 0.45, 0, 1)

      # ti.train.pred01 = ifelse(ti.train.pred < nvector[i], 0, 1)
      # ti.test.pred01 = ifelse(ti.test.pred < nvector[i], 0, 1)

      # use for bernoulli? NOTE these values are NEGATIVE
      # ti.train.pred01 = ifelse(ti.train.pred < -0.36, 0, 1)
      # ti.test.pred01 = ifelse(ti.test.pred < -0.36, 0, 1)

      ti.train.error <- sum(ti.train.pred01 != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.pred01 != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.pred01, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.pred01, ti.testset$survived))

      # s serves as batch number in col 1
      results[(s - 1) * n.val + i, ] <-
        c(s, nvector[i], ti.train.error, ti.test.error, ti.train.acc, ti.test.acc)
    }
  }
  ti.table <- as.data.frame(results)

  # Sanity check of ti table - one row per seed/parameter pair.
  # Explicit print(): bare expressions are not auto-printed inside a function.
  print(paste("swept parameter:", swept.param))
  print(dim(ti.table))
  print(head(ti.table, 10))

  # Average the train and test errors/accuracies by parameter value (batch fixed at 1)
  ti.table.nmeans <- do.call(rbind, lapply(nvector, function(v) {
    hit <- ti.table[["param"]] == v
    data.frame(batch = 1, param = v,
               trainerr = mean(ti.table[["trainerr"]][hit]),
               testerr = mean(ti.table[["testerr"]][hit]),
               trainacc = mean(ti.table[["trainacc"]][hit]),
               testacc = mean(ti.table[["testacc"]][hit]))
  }))

  # Sanity check - series look smoother
  print(ti.table.nmeans)

  par(mfrow=c(2,1))
  axis.label <- paste(swept.param, "param")

  plot(ti.table.nmeans$param, ti.table.nmeans$trainerr, pch=22, bg="blue", type="b",
    main=paste("Avg Training vs Test Error by", axis.label), xlab=axis.label, ylab="Error", ylim=c(.10, .40))
  points(ti.table.nmeans$param, ti.table.nmeans$testerr, pch=22, bg="red", type="b")

  plot(ti.table.nmeans$param, ti.table.nmeans$trainacc, pch=22, bg="blue", type="b",
    main=paste("Avg Training vs Test Accuracy by", axis.label), xlab=axis.label, ylab="Accuracy", ylim=c(.60, .90))
  points(ti.table.nmeans$param, ti.table.nmeans$testacc, pch=22, bg="red", type="b")

  print('=============== END jj91E ===============')

}





#
# jj91
# -------
jj91 = function() {

  # Fits the 14VAR bernoulli gbm on the full trainV2.csv, prints the training
  # confusion matrix and accuracy, then predicts testV2.csv and writes the 0/1
  # predictions to testpredictions_jj91C.csv.
  #
  # Takes no arguments. Relies on accuracyFrom2x2ConfusionMatrix(), which is defined
  # outside this function.

  print('=============== MODEL: jj91 gbm trainV2 as read in (8810) ===============')

  # Shared preparation for the train and test frames.
  # -------- PROPOSED Secret Sauce primer: ADD combo variables to get from 9VAR to 12VAR/14VAR -----------
  # Columns are added by name rather than at fixed positions 14..18, so the code no
  # longer depends on the CSV having exactly 13 columns.
  add.combo.vars <- function(df) {
    sex <- df[["sex"]]
    pclass <- df[["pclass"]]
    cabin.is.x <- ifelse(df[["cabincode"]] == "X", 1, 0)
    df[["sex_pclass"]] <- as.factor(paste0(sex, pclass))
    df[["sex_agebin3"]] <- as.factor(paste0(sex, df[["agebin3"]]))
    df[["sex_farebin3"]] <- as.factor(paste0(sex, df[["farebin3"]]))
    df[["sex_cabinX"]] <- as.factor(paste0(sex, cabin.is.x))
    df[["pclass_cabinX"]] <- as.factor(paste0(pclass, cabin.is.x))
    df
  }

  # Load the trainV2 TRAINING data - DATA PREP done in Excel.
  # stringsAsFactors = TRUE is spelled out because the read.table default changed to
  # FALSE in R 4.0, and gbm does not accept character predictors.
  ti.train <- add.combo.vars(
    read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
               header = TRUE, sep = ",", stringsAsFactors = TRUE))

  # Note - response var must be numeric for gbm
  library(gbm)

  # NOTE(review): the formula previously used "agebin"; it now uses "agebin3" so the
  # submitted model matches the 14VAR formula that jj91E evaluates - confirm that
  # "agebin" was not a deliberate choice of a different column.
  ti.gbm <- gbm(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
                           +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX +pclass_cabinX,
    data = ti.train,
    distribution = "bernoulli",
    n.trees = 500,
    interaction.depth = 2,
    shrinkage = 0.012,
    cv.folds = 5)

  # Three panels side by side; the caller's graphics settings are restored on exit
  old.par <- par(mfrow = c(1, 3))
  on.exit(par(old.par), add = TRUE)
  print(plot(ti.gbm))

  # ---------- Perf evals from R docs --------------
  # check performance using 5-fold cross-validation
  best.iter <- gbm.perf(ti.gbm, method="cv")
  print(best.iter)

  # plot variable influence
  summary(ti.gbm, n.trees=best.iter) # based on the estimated best number of trees
  # ---------- Perf evals from docs --------------

  # Same 0.45 cutoff as used in the jj91E evaluation runs
  ti.train.pred <- predict(ti.gbm, ti.train, n.trees = best.iter)
  ti.train.pred01 <- ifelse(ti.train.pred < 0.45, 0, 1)

  ti.train.tab <- table(ti.train.pred01, ti.train$survived, dnn=c("predicted","actual"))

  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj91 starting TEST predictions ===============')

  # Load the testV2 TEST data - DATA PREP done in Excel
  ti.test <- add.combo.vars(
    read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv",
               header = TRUE, sep = ",", stringsAsFactors = TRUE))

  print("test predictions being output to ti.test.pred...")

  ti.test.pred <- predict(ti.gbm, ti.test, n.trees = best.iter)
  ti.test.pred01 <- ifelse(ti.test.pred < 0.45, 0, 1)

  # Save predictions.
  # The previous col.names=FALSE argument was dropped: write.csv ignores it (with a
  # warning) and always writes a header row, so the file contents are unchanged.
  # Use write.table(..., sep = ",", col.names = FALSE) if a headerless file is needed.
  write.csv(ti.test.pred01, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj91C.csv",
    row.names=FALSE, quote = FALSE)

}






#
# jj9E - gbm EVAL script - choose BEST params, then use jj9 routine to create test predictions for SUBMIT
# -------
jj9E = function() {

  # Evaluation harness for the 9VAR gbm models.
  #
  # For each seed in seedvector a random third of trainV2.csv is held out, a gbm is
  # fitted on the remaining rows once per interaction depth in idepthvector, and the
  # train/test error and accuracy are recorded. The per-depth averages over all seeds
  # are then printed and plotted.
  #
  # Takes no arguments. Side effects: reads trainV2.csv, prints the result tables and
  # draws the gbm.perf plots plus two summary plots. Relies on
  # accuracyFrom2x2ConfusionMatrix(), which is defined outside this function.

  # accuracy w 1/3 holdout train data                                           : train-avg test-avg
  # ---------------------------------------------------------------------------   ------    ------
  # gaussian n.tree=2000 shrinkage=.003 bag.fraction=dflt interaction.depth=8   : 0.8632997 0.8720539 0.78469 << -- (BAD traindata) Kaggle public score as _jj9
  # gaussian n.tree=2000 shrinkage=.003 bag.fraction=0.7  interaction.depth=8   : 0.8663300 0.8754209
  # gaussian n.tree=2000 shrinkage=.003 bag.fraction=0.2  interaction.depth=8   : 0.8613917 0.8709315 0.76555 << -- (BAD traindata) Kaggle public score as _jj9B

  # bernoulli n.tree=2000 shrinkage=.010 bag.fraction=dflt interaction.depth=5  : 0.8479237 0.8563412
  # bernoulli n.tree=2000 shrinkage=.003 bag.fraction=dflt interaction.depth=6  : 0.8473625 0.8574635
  # bernoulli n.tree=9000 shrinkage=.001 bag.fraction=dflt interaction.depth=5  : 0.8462402 0.8552189 0.74641 << -- (BAD traindata) Kaggle public score as _jj9C

  # adaboost n.tree=9000 shrinkage=.001 bag.fraction=dflt interaction.depth=7   : 0.8187430 0.8148148


  print('=============== MODEL: jj9E gbm trainV2 variations (xxxx)===============')

  # Load the trainV2 TRAINING data - DATA PREP done in Excel.
  # stringsAsFactors = TRUE is spelled out because the read.table default changed to
  # FALSE in R 4.0, and gbm does not accept character predictors.
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
                         header = TRUE, sep = ",", stringsAsFactors = TRUE)

  # All other vars are as-read, NO use of as.factor(...) on the coded predictors

  # The seed vector serves as unique sample seed and also as batch number
  seedvector <- 1:3 ## 1:5 ##  1:10

  library(gbm)

  # gbm.perf draws one plot per fit into a 2x5 grid; the caller's graphics settings
  # are restored on exit
  old.par <- par(mfrow = c(2, 5))
  on.exit(par(old.par), add = TRUE)

  idepthvector <- 8 ## c(.0400, .0350, .0300, .0250, .0200, .0150, .0100, .0080, .0060, .0040)

  # One preallocated row per (seed, interaction depth) pair
  n.depth <- length(idepthvector)
  col.names <- c("batch", "idepth", "trainerr", "testerr", "trainacc", "testacc")
  results <- matrix(NA_real_, nrow = length(seedvector) * n.depth, ncol = length(col.names),
                    dimnames = list(NULL, col.names))

  # Iterate over seedvector and idepthvector to train and test batches of trees
  for (s in seq_along(seedvector)) {
    # Reset sample seed, then slice new training and test sets with the same ratio
    set.seed(seedvector[s])
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    # Iterate the idepth values for this batch
    for (i in seq_along(idepthvector)) {

      ti.gbm <- gbm(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3,
        ## data=ti.train,
        ## corrected to trainset!
        data = ti.trainset,
        distribution = "bernoulli", # "gaussian", # "adaboost", ##
        n.trees = 9000,             # 10000,
        shrinkage = 0.001,
        interaction.depth = idepthvector[i],
        cv.folds = 5)

      # ---------- Perf evals from R docs --------------
      # check performance using 5-fold cross-validation
      best.iter <- gbm.perf(ti.gbm, method="cv")
      print(best.iter)
      # ---------- Perf evals from docs --------------

      ti.train.pred <- predict(ti.gbm, ti.trainset, n.trees = best.iter)
      ti.train.pred01 <- ifelse(ti.train.pred < 0.5, 0, 1)
      ti.test.pred <- predict(ti.gbm, ti.testset, n.trees = best.iter)
      ti.test.pred01 <- ifelse(ti.test.pred < 0.5, 0, 1)

      ti.train.error <- sum(ti.train.pred01 != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.pred01 != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.pred01, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.pred01, ti.testset$survived))

      # s serves as batch number in col 1
      results[(s - 1) * n.depth + i, ] <-
        c(s, idepthvector[i], ti.train.error, ti.test.error, ti.train.acc, ti.test.acc)
    }
  }
  ti.table <- as.data.frame(results)

  # Sanity check of ti table - one row per seed/idepth pair.
  # Explicit print(): bare expressions are not auto-printed inside a function.
  # head() avoids the NA padding that ti.table[1:10,] gives on a shorter table.
  print(dim(ti.table))
  print(head(ti.table, 10))

  # Average the train and test errors/accuracies by idepth (batch column fixed at 1)
  ti.table.idepthmeans <- do.call(rbind, lapply(idepthvector, function(d) {
    hit <- ti.table[["idepth"]] == d
    data.frame(batch = 1, idepth = d,
               trainerr = mean(ti.table[["trainerr"]][hit]),
               testerr = mean(ti.table[["testerr"]][hit]),
               trainacc = mean(ti.table[["trainacc"]][hit]),
               testacc = mean(ti.table[["testacc"]][hit]))
  }))

  # Sanity check - series look smoother
  print(ti.table.idepthmeans)

  par(mfrow=c(2,1))

  plot(ti.table.idepthmeans$idepth, ti.table.idepthmeans$trainerr, pch=22, bg="blue", type="b",
    main="Avg Training vs Test Error by idepth param", xlab="idepth param", ylab="Error", ylim=c(.10, .40))
  points(ti.table.idepthmeans$idepth, ti.table.idepthmeans$testerr, pch=22, bg="red", type="b")

  plot(ti.table.idepthmeans$idepth, ti.table.idepthmeans$trainacc, pch=22, bg="blue", type="b",
    main="Avg Training vs Test Accuracy by idepth param", xlab="idepth param", ylab="Accuracy", ylim=c(.60, .90))
  points(ti.table.idepthmeans$idepth, ti.table.idepthmeans$testacc, pch=22, bg="red", type="b")

  # The banner previously announced "starting TEST predictions", but nothing follows it
  print('=============== END jj9E ===============')

}



#
# jj9
# -------
jj9 <- function() {
  # Fit a gbm (bernoulli) survival model on trainV2.csv, print diagnostics and
  # the training confusion matrix / accuracy, then score testV2.csv and write
  # the 0/1 predictions to testpredictions_jj9C.csv.
  #
  # Takes no arguments. Relies on accuracyFrom2x2ConfusionMatrix(), which is
  # defined elsewhere in this file. The value of the function is whatever
  # write.csv() returns; it is called for its side effects only.

  print('=============== MODEL: jj9 gbm trainV2 as read in (8810) ===============')

  # Load the trainV2 TRAINING data (DATA PREP done in Excel).
  # R >= 4.0 no longer turns text columns into factors by default, and gbm
  # only accepts numeric / ordered / factor predictors, so request it here.
  # Numeric predictors (sibsp, parch, pclass, ...) are deliberately left
  # numeric in this routine, i.e. no as.factor() conversion.
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
    header = TRUE, sep = ",", stringsAsFactors = TRUE)

  # Note - response var must be numeric for gbm
  library(gbm)
  ti.gbm <- gbm(
    survived ~ sex + sibsp + parch + pclass + embarked + cabincode +
      caibinlen5 + agebin3 + farebin3,
    data = ti.train,
    distribution = "bernoulli", ## "gaussian", # "adaboost", #
    n.trees = 9000, interaction.depth = 5, shrinkage = 0.001,
    cv.folds = 5)

  par(mfrow = c(2, 1))
  print(plot(ti.gbm))

  # ---------- Perf evals from R docs --------------
  # check performance using an out-of-bag estimator
  # OOB underestimates the optimal number of iterations
  best.iter <- gbm.perf(ti.gbm, method = "OOB")
  print(best.iter)

  # check performance using 5-fold cross-validation; this estimate replaces
  # the OOB one and is the tree count used for every prediction below
  best.iter <- gbm.perf(ti.gbm, method = "cv")
  print(best.iter)

  # plot variable influence
  summary(ti.gbm, n.trees = best.iter) # based on the estimated best number of trees
  # ---------- Perf evals from docs --------------

  # predict.gbm() returns log-odds unless type = "response" is requested.
  # The 0.5 cut-off below is meant for probabilities, so ask for them.
  ti.train.pred <- predict(ti.gbm, newdata = ti.train, n.trees = best.iter,
    type = "response")
  ti.train.pred01 <- ifelse(ti.train.pred < 0.5, 0, 1)

  ti.train.tab <- table(ti.train.pred01, ti.train$survived,
    dnn = c("predicted", "actual"))

  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj9 starting TEST predictions ===============')

  # Load the testV2 TEST data (DATA PREP done in Excel)
  ti.test <- read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv",
    header = TRUE, sep = ",", stringsAsFactors = TRUE)

  print("test predictions being output to ti.test.pred...")

  ti.test.pred <- predict(ti.gbm, newdata = ti.test, n.trees = best.iter,
    type = "response")
  ti.test.pred01 <- ifelse(ti.test.pred < 0.5, 0, 1)

  # Save predictions.
  # write.csv() ignores col.names (with a warning) and always writes a header
  # row, so the argument is not passed; the file contents are unchanged.
  write.csv(ti.test.pred01, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj9C.csv",
    row.names = FALSE, quote = FALSE)

}




#
# jj8E - randomForest with 12VAR COMBO features EVAL script - choose BEST params, then MAKE/use jj8 routine to create test predictions for SUBMIT
# -------             -------------------------

jj8E <- function() {
  # Evaluation harness for the randomForest model.
  #
  # For each of 10 seeds, holds out a random third of trainV2.csv, fits a
  # regression forest (survived is left numeric) for every tree count in
  # ntvector, thresholds the predictions at .36 and records train/test error
  # and accuracy. The per-tree-count averages over the seeds are then plotted.
  #
  # Takes no arguments. Relies on accuracyFrom2x2ConfusionMatrix(), which is
  # defined elsewhere in this file.

  # accuracy w 1/3 holdout train data             : train     test
  # --------------------------------------------    --------- ---------

  # * 9VAR  = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
  # * 12VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass       +sex_agebin3 +sex_farebin3
  # * 8SVAR = sibsp +pclass +cabincode +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX

  print('=============== MODEL: jj8E randomForest trainV2 12VAR variations ===============')

  # Load the trainV2 TRAINING data (DATA PREP done in Excel).
  # R >= 4.0 no longer converts strings to factors by default; ask for it
  # explicitly so text columns stay factors.
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
    header = TRUE, sep = ",", stringsAsFactors = TRUE)

  # survived is intentionally NOT converted to a factor: the forest is fitted
  # as a REGRESSION and its output is cut to 0/1 further down.
  for (v in c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")) {
    ti.train[[v]] <- as.factor(ti.train[[v]])
  }

  # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
  # Columns are added by name rather than by position, so the result does not
  # depend on how many columns the CSV happens to have.
  cabin.x <- ifelse(ti.train$cabincode == 'X', 1, 0)
  ti.train$sex_pclass    <- as.factor(paste0(ti.train$sex, ti.train$pclass))
  ti.train$sex_agebin3   <- as.factor(paste0(ti.train$sex, ti.train$agebin3))
  ti.train$sex_farebin3  <- as.factor(paste0(ti.train$sex, ti.train$farebin3))
  ti.train$sex_cabinX    <- as.factor(paste0(ti.train$sex, cabin.x))
  ti.train$pclass_cabinX <- as.factor(paste0(ti.train$pclass, cabin.x))

  # The seed vector serves as unique sample seed and also as batch number 1-10
  seedvector <- 1:10
  ntvector <- c(40, 60, 80, 100, 200, 400, 600, 1000, 2000, 4000)
  n.seeds <- length(seedvector)
  n.nt <- length(ntvector)

  # Results table, one row per (batch, nt) pair, preallocated
  table.cols <- c("batch", "nt", "trainerr", "testerr", "trainacc", "testacc")
  ti.table <- as.data.frame(matrix(NA_real_, nrow = n.seeds * n.nt,
    ncol = length(table.cols), dimnames = list(NULL, table.cols)))

  library(randomForest)

  # 8SVAR predictor set (swap in the 12VAR set listed above, here and in the
  # formula below, to evaluate that variant instead)
  rf.vars <- c("sibsp", "pclass", "cabincode", "sex_pclass", "sex_agebin3",
    "sex_farebin3", "sex_cabinX", "pclass_cabinX")

  # Iterate over seedvector and ntvector to train and test batches of forests
  for (s in seq_along(seedvector)) {
    # Reset sample seed
    set.seed(seedvector[s])
    # Slice new training and test sets with same ratio
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    # Iterate the nt values for this batch
    for (i in seq_along(ntvector)) {

      # ntree is spelled out in full; the former "nt=" only worked through
      # partial argument matching.
      ti.rf <- randomForest(
        survived ~ sibsp + pclass + cabincode + sex_pclass + sex_agebin3 +
          sex_farebin3 + sex_cabinX + pclass_cabinX,
        data = ti.trainset, ntree = ntvector[i])

      # predict() without newdata gives the out-of-bag training predictions
      ti.train.predictions <- predict(ti.rf)
      ti.test.predictions <- predict(ti.rf, ti.testset[, rf.vars])

      # ONLY if using regression instead of classification:
      ti.train.predictions <- ifelse(ti.train.predictions < .36, 0, 1)
      ti.test.predictions <- ifelse(ti.test.predictions < .36, 0, 1)

      ti.train.error <- sum(ti.train.predictions != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.predictions != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.predictions, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.predictions, ti.testset$survived))

      # Populate a row of the table... s serves as batch number in col 1.
      # Row offset uses the real length of ntvector instead of a literal 10.
      ti.table[(s - 1) * n.nt + i, ] <-
        c(s, ntvector[i], ti.train.error, ti.test.error, ti.train.acc, ti.test.acc)
    }
  }

  # Sanity check of ti table – one batch per seed, each with the same nt series.
  # Inside a function nothing auto-prints, so print explicitly.
  print(dim(ti.table))
  print(ti.table[1:4, ])
  print(ti.table[n.nt + 1:4, ])

  # Average every metric over the batches, per nt value. Rows are grouped by
  # their position inside a batch, which avoids comparing numbers with ==.
  param.index <- rep(seq_len(n.nt), times = n.seeds)
  avg.by.param <- function(col) {
    as.vector(tapply(ti.table[[col]], param.index, mean))
  }
  ti.table.ntmeans <- data.frame(
    batch = 1,
    nt = ntvector,
    trainerr = avg.by.param("trainerr"),
    testerr = avg.by.param("testerr"),
    trainacc = avg.by.param("trainacc"),
    testacc = avg.by.param("testacc"))

  # Sanity check – series look smoother
  print(ti.table.ntmeans)

  par(mfrow = c(2, 1))

  plot(ti.table.ntmeans$nt, ti.table.ntmeans$trainerr, pch = 22, bg = "blue", type = "b",
    main = "Avg Training vs Test Error by nt param", xlab = "nt param", ylab = "Error", ylim = c(.15, .21))
  points(ti.table.ntmeans$nt, ti.table.ntmeans$testerr, pch = 22, bg = "red", type = "b")

  plot(ti.table.ntmeans$nt, ti.table.ntmeans$trainacc, pch = 22, bg = "blue", type = "b",
    main = "Avg Training vs Test Accuracy by nt param", xlab = "nt param", ylab = "Accuracy", ylim = c(.78, .86))
  points(ti.table.ntmeans$nt, ti.table.ntmeans$testacc, pch = 22, bg = "red", type = "b")

  print('=============== END: jj8E ===============')

}



#
# jj8
# -------
jj8 <- function() {
  # Fit the 8SVAR randomForest (regression on numeric survived, cut at .36)
  # on all of trainV2.csv, print the training confusion matrix / accuracy,
  # then score testV2.csv and write the 0/1 predictions to
  # testpredictions_jj8B.csv.
  #
  # Takes no arguments. Relies on accuracyFrom2x2ConfusionMatrix(), which is
  # defined elsewhere in this file.

  print('=============== MODEL: jj8 randomForest trainV2 12VAR as factors ===============')

  # Shared preparation for the train and test frames: convert the binned /
  # count columns to factors and add the combo variables.
  # Columns are added by name rather than by position, so the result does not
  # depend on how many columns the CSV happens to have.
  add.features <- function(df) {
    for (v in c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")) {
      df[[v]] <- as.factor(df[[v]])
    }
    # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
    cabin.x <- ifelse(df$cabincode == 'X', 1, 0)
    df$sex_pclass    <- as.factor(paste0(df$sex, df$pclass))
    df$sex_agebin3   <- as.factor(paste0(df$sex, df$agebin3))
    df$sex_farebin3  <- as.factor(paste0(df$sex, df$farebin3))
    df$sex_cabinX    <- as.factor(paste0(df$sex, cabin.x))
    df$pclass_cabinX <- as.factor(paste0(df$pclass, cabin.x))
    df
  }

  # Load the trainV2 TRAINING data (DATA PREP done in Excel).
  # R >= 4.0 no longer converts strings to factors by default; ask for it
  # explicitly so text columns stay factors.
  # survived is left numeric on purpose => regression response.
  ti.train <- add.features(
    read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
      header = TRUE, sep = ",", stringsAsFactors = TRUE))

  library(randomForest)

  # 8SVAR version (the 12VAR alternative used
  #   sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3
  #   +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3 with ntree = 100)
  rf.vars <- c("sibsp", "pclass", "cabincode", "sex_pclass", "sex_agebin3",
    "sex_farebin3", "sex_cabinX", "pclass_cabinX")
  ti.rf <- randomForest(
    survived ~ sibsp + pclass + cabincode + sex_pclass + sex_agebin3 +
      sex_farebin3 + sex_cabinX + pclass_cabinX,
    data = ti.train, ntree = 60)

  # print(plot(ti.rf))
  # predict() without newdata gives the out-of-bag training predictions
  ti.train.pred <- predict(ti.rf)

  # ONLY if using regression instead of classification:
  ti.train.pred <- ifelse(ti.train.pred < .36, 0, 1)

  ti.train.tab <- table(ti.train.pred, ti.train$survived, dnn = c("predicted", "actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj8 starting TEST predictions ===============')

  # Load the testV2 TEST data (DATA PREP done in Excel)
  ti.test <- add.features(
    read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv",
      header = TRUE, sep = ",", stringsAsFactors = TRUE))

  print("test predictions being output to ti.test.pred...")

  ti.test.pred <- predict(ti.rf, ti.test[, rf.vars])

  # ONLY if using regression instead of classification:
  ti.test.pred <- ifelse(ti.test.pred < .36, 0, 1)

  # Save predictions.
  # write.csv() ignores col.names (with a warning) and always writes a header
  # row, so the argument is not passed; the file contents are unchanged.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj8B.csv",
    row.names = FALSE, quote = FALSE)

}





#
# jj73E - rpart with 8SVAR COMBO features EVAL script - choose BEST cp value, then MAKE/use jj73 routine to create test predictions for SUBMIT
# -------            -------------------------

jj73E <- function() {
  # Evaluation harness for the 8SVAR rpart classification tree.
  #
  # For each of 10 seeds, holds out a random third of trainV2.csv, fits a tree
  # (minsplit = 10) for every cp value in cpvector and records train/test
  # error and accuracy. The per-cp averages over the seeds are then plotted.
  #
  # Takes no arguments. Relies on accuracyFrom2x2ConfusionMatrix(), which is
  # defined elsewhere in this file.

  # accuracy w 1/3 holdout train data                      : train     test       nleaf

  # rpart 8SVAR as-factors  minsplit = 10 BEST cp = 0.006  : 0.xxxxxxx 0.xxxxxxx   12+      76077  << -- Kaggle public score as _jj73

  # * 9VAR  = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
  # * 12VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3
  # * 8SVAR = sibsp +pclass +cabincode +sex_pclass +sex_agebin3 +sex_farebin3 +sex_cabinX + pclass_cabinX

  print('=============== MODEL: jj73E rpart trainV2 12VAR variations (xxxx)===============')

  # Load explicitly, as the gbm / randomForest routines do for their packages
  library(rpart)

  # Load the trainV2 TRAINING data (DATA PREP done in Excel).
  # R >= 4.0 no longer converts strings to factors by default; ask for it
  # explicitly so text columns stay factors.
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
    header = TRUE, sep = ",", stringsAsFactors = TRUE)

  for (v in c("survived", "sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")) {
    ti.train[[v]] <- as.factor(ti.train[[v]])
  }

  # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
  # Columns are added by name rather than by position, so the result does not
  # depend on how many columns the CSV happens to have.
  cabin.x <- ifelse(ti.train$cabincode == 'X', 1, 0)
  ti.train$sex_pclass    <- as.factor(paste0(ti.train$sex, ti.train$pclass))
  ti.train$sex_agebin3   <- as.factor(paste0(ti.train$sex, ti.train$agebin3))
  ti.train$sex_farebin3  <- as.factor(paste0(ti.train$sex, ti.train$farebin3))
  ti.train$sex_cabinX    <- as.factor(paste0(ti.train$sex, cabin.x))
  ti.train$pclass_cabinX <- as.factor(paste0(ti.train$pclass, cabin.x))

  # The seed vector serves as unique sample seed and also as batch number 1-10
  seedvector <- 1:10

  # cpvector = .015
  cpvector <- c(.0400, .0350, .0300, .0250, .0200, .0150, .0100, .0080, .0060, .0040)
  n.seeds <- length(seedvector)
  n.cp <- length(cpvector)

  # Results table, one row per (batch, cp) pair, preallocated
  table.cols <- c("batch", "cp", "trainerr", "testerr", "trainacc", "testacc")
  ti.table <- as.data.frame(matrix(NA_real_, nrow = n.seeds * n.cp,
    ncol = length(table.cols), dimnames = list(NULL, table.cols)))

  # Grid for the optional per-tree plots inside the loop (currently disabled)
  par(mfrow = c(2, 6))

  # Iterate over seedvector and cpvector to train and test batches of trees
  for (s in seq_along(seedvector)) {
    # Reset sample seed
    set.seed(seedvector[s])
    # Slice new training and test sets with same ratio
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    # Iterate the cp values for this batch
    for (i in seq_along(cpvector)) {

      ti.tree <- rpart(
        survived ~ sibsp + pclass + cabincode + sex_pclass + sex_agebin3 +
          sex_farebin3 + sex_cabinX + pclass_cabinX,
        method = "class",
        data = ti.trainset,
        cp = cpvector[i],
        minsplit = 10)

      # if (i %in% c(10)) { print(plot(ti.tree)); print(text(ti.tree)); }

      ti.test.predictions <- predict(ti.tree, ti.testset, type = "class")
      ti.train.predictions <- predict(ti.tree, type = "class")
      ti.train.error <- sum(ti.train.predictions != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.predictions != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.predictions, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.predictions, ti.testset$survived))

      # Populate a row of the table... s serves as batch number in col 1.
      # Row offset uses the real length of cpvector instead of a literal 10.
      ti.table[(s - 1) * n.cp + i, ] <-
        c(s, cpvector[i], ti.train.error, ti.test.error, ti.train.acc, ti.test.acc)
    }
  }

  # Sanity check of ti table – one batch per seed, each with the same cp series.
  # Inside a function nothing auto-prints, so print explicitly.
  print(dim(ti.table))
  print(ti.table[1:4, ])
  print(ti.table[n.cp + 1:4, ])

  # Average every metric over the batches, per cp value. Rows are grouped by
  # their position inside a batch, which avoids comparing floats with ==.
  param.index <- rep(seq_len(n.cp), times = n.seeds)
  avg.by.param <- function(col) {
    as.vector(tapply(ti.table[[col]], param.index, mean))
  }
  ti.table.cpmeans <- data.frame(
    batch = 1,
    cp = cpvector,
    trainerr = avg.by.param("trainerr"),
    testerr = avg.by.param("testerr"),
    trainacc = avg.by.param("trainacc"),
    testacc = avg.by.param("testacc"))

  # Sanity check – series look smoother
  print(ti.table.cpmeans)

  par(mfrow = c(2, 1))

  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainerr, pch = 22, bg = "blue", type = "b",
    main = "Avg Training vs Test Error by cp param", xlab = "cp param", ylab = "Error", ylim = c(.15, .21))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testerr, pch = 22, bg = "red", type = "b")

  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainacc, pch = 22, bg = "blue", type = "b",
    main = "Avg Training vs Test Accuracy by cp param", xlab = "cp param", ylab = "Accuracy", ylim = c(.78, .86))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testacc, pch = 22, bg = "red", type = "b")

  print('=============== END: jj73E ===============')

}




#
# jj73
# -------
jj73 <- function() {
  # Fit the 8SVAR rpart classification tree (cp = .004, minsplit = 10) on all
  # of trainV2.csv, plot it, print the training confusion matrix / accuracy,
  # then score testV2.csv and write the predicted classes to
  # testpredictions_jj73.csv.
  #
  # Takes no arguments. Relies on accuracyFrom2x2ConfusionMatrix(), which is
  # defined elsewhere in this file.

  print('=============== MODEL: jj73 rpart trainV2 14VAR as factors ===============')

  # Load explicitly, as the gbm / randomForest routines do for their packages
  library(rpart)

  # Shared preparation for the train and test frames: convert the binned /
  # count columns to factors and add the combo variables.
  # Columns are added by name rather than by position, so the result does not
  # depend on how many columns the CSV happens to have.
  add.features <- function(df) {
    for (v in c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")) {
      df[[v]] <- as.factor(df[[v]])
    }
    # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
    cabin.x <- ifelse(df$cabincode == 'X', 1, 0)
    df$sex_pclass    <- as.factor(paste0(df$sex, df$pclass))
    df$sex_agebin3   <- as.factor(paste0(df$sex, df$agebin3))
    df$sex_farebin3  <- as.factor(paste0(df$sex, df$farebin3))
    df$sex_cabinX    <- as.factor(paste0(df$sex, cabin.x))
    df$pclass_cabinX <- as.factor(paste0(df$pclass, cabin.x))
    df
  }

  # Load the trainV2 TRAINING data (DATA PREP done in Excel).
  # R >= 4.0 no longer converts strings to factors by default; ask for it
  # explicitly so text columns stay factors.
  ti.train <- add.features(
    read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
      header = TRUE, sep = ",", stringsAsFactors = TRUE))
  ti.train$survived <- as.factor(ti.train$survived)

  # Adjusted variable "parch" from 9 to 6 in rows 344 and 367 of testV2.csv to normalize train/test factor vars

  ti.tree <- rpart(
    survived ~ sibsp + pclass + cabincode + sex_pclass + sex_agebin3 +
      sex_farebin3 + sex_cabinX + pclass_cabinX,
    method = "class",
    data = ti.train,
    cp = .004,             ## .015,  ## cp = .015,   # cp = .01,
    minsplit = 10)

  par(mfrow = c(1, 1))
  print(plot(ti.tree))
  print(text(ti.tree))

  ti.train.pred <- predict(ti.tree, type = "class")

  ti.train.tab <- table(ti.train.pred, ti.train$survived, dnn = c("predicted", "actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj73 starting TEST predictions ===============')

  # Load the testV2 TEST data (DATA PREP done in Excel)
  ti.test <- add.features(
    read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv",
      header = TRUE, sep = ",", stringsAsFactors = TRUE))

  ti.test.pred <- predict(ti.tree, ti.test, type = "class")

  # Save predictions.
  # write.csv() ignores col.names (with a warning) and always writes a header
  # row, so the argument is not passed; the file contents are unchanged.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj73.csv",
    row.names = FALSE, quote = FALSE)

}







#
# jj72E - rpart with 14VAR COMBO features EVAL script - choose BEST cp value, then MAKE/use jj72 routine to create test predictions for SUBMIT
# -------            -------------------------

jj72E <- function() {
  # Evaluation harness for the 14VAR rpart classification tree.
  #
  # For each of 10 seeds, holds out a random third of trainV2.csv, fits a tree
  # (minsplit = 20) for every cp value in cpvector and records train/test
  # error and accuracy. The per-cp averages over the seeds are then plotted.
  #
  # Takes no arguments. Relies on accuracyFrom2x2ConfusionMatrix(), which is
  # defined elsewhere in this file.

  # accuracy w 1/3 holdout train data                      : train     test       nleaf

  # rpart 14VAR as-factors  minsplit = 15 BEST cp = 0.0068 : 0.xxxxxxx 0.xxxxxxx   8  0.78469 << -- Kaggle public score as _jj72B (tweaked to get 8 leaves)

  # rpart 14VAR as-factors  minsplit = 20 BEST cp = 0.0200 : 0.8368687 0.8114478   4

  # * 9VAR  = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
  # * 12VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3

  print('=============== MODEL: jj72E rpart trainV2 12VAR variations (xxxx)===============')

  # Load explicitly, as the gbm / randomForest routines do for their packages
  library(rpart)

  # Load the trainV2 TRAINING data (DATA PREP done in Excel).
  # R >= 4.0 no longer converts strings to factors by default; ask for it
  # explicitly so text columns stay factors.
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
    header = TRUE, sep = ",", stringsAsFactors = TRUE)

  for (v in c("survived", "sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")) {
    ti.train[[v]] <- as.factor(ti.train[[v]])
  }

  # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
  # Columns are added by name rather than by position, so the result does not
  # depend on how many columns the CSV happens to have.
  cabin.x <- ifelse(ti.train$cabincode == 'X', 1, 0)
  ti.train$sex_pclass    <- as.factor(paste0(ti.train$sex, ti.train$pclass))
  ti.train$sex_agebin3   <- as.factor(paste0(ti.train$sex, ti.train$agebin3))
  ti.train$sex_farebin3  <- as.factor(paste0(ti.train$sex, ti.train$farebin3))
  ti.train$sex_cabinX    <- as.factor(paste0(ti.train$sex, cabin.x))
  ti.train$pclass_cabinX <- as.factor(paste0(ti.train$pclass, cabin.x))

  # The seed vector serves as unique sample seed and also as batch number 1-10
  seedvector <- 1:10

  # cpvector = .0200
  cpvector <- c(.0400, .0350, .0300, .0250, .0200, .0150, .0100, .0080, .0060, .0040)
  n.seeds <- length(seedvector)
  n.cp <- length(cpvector)

  # Results table, one row per (batch, cp) pair, preallocated
  table.cols <- c("batch", "cp", "trainerr", "testerr", "trainacc", "testacc")
  ti.table <- as.data.frame(matrix(NA_real_, nrow = n.seeds * n.cp,
    ncol = length(table.cols), dimnames = list(NULL, table.cols)))

  # Iterate over seedvector and cpvector to train and test batches of trees
  for (s in seq_along(seedvector)) {
    # Reset sample seed
    set.seed(seedvector[s])
    # Slice new training and test sets with same ratio
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    # Iterate the cp values for this batch
    for (i in seq_along(cpvector)) {

      ti.tree <- rpart(
        survived ~ sex + sibsp + parch + pclass + embarked + cabincode +
          caibinlen5 + agebin3 + farebin3 +
          sex_pclass + sex_agebin3 + sex_farebin3 + sex_cabinX + pclass_cabinX,
        method = "class",
        data = ti.trainset,
        cp = cpvector[i],
        minsplit = 20)        ## 20 )

      # if (i %in% c(1,3,5,7)) { print(plot(ti.tree)); print(text(ti.tree)); }

      ti.test.predictions <- predict(ti.tree, ti.testset, type = "class")
      ti.train.predictions <- predict(ti.tree, type = "class")
      ti.train.error <- sum(ti.train.predictions != ti.trainset$survived)/nrow(ti.trainset)
      ti.test.error <- sum(ti.test.predictions != ti.testset$survived)/nrow(ti.testset)
      ti.train.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.train.predictions, ti.trainset$survived))
      ti.test.acc <- accuracyFrom2x2ConfusionMatrix(table(ti.test.predictions, ti.testset$survived))

      # Populate a row of the table... s serves as batch number in col 1.
      # Row offset uses the real length of cpvector instead of a literal 10.
      ti.table[(s - 1) * n.cp + i, ] <-
        c(s, cpvector[i], ti.train.error, ti.test.error, ti.train.acc, ti.test.acc)
    }
  }

  # Sanity check of ti table – one batch per seed, each with the same cp series.
  # Inside a function nothing auto-prints, so print explicitly.
  print(dim(ti.table))
  print(ti.table[1:4, ])
  print(ti.table[n.cp + 1:4, ])

  # Average every metric over the batches, per cp value. Rows are grouped by
  # their position inside a batch, which avoids comparing floats with ==.
  param.index <- rep(seq_len(n.cp), times = n.seeds)
  avg.by.param <- function(col) {
    as.vector(tapply(ti.table[[col]], param.index, mean))
  }
  ti.table.cpmeans <- data.frame(
    batch = 1,
    cp = cpvector,
    trainerr = avg.by.param("trainerr"),
    testerr = avg.by.param("testerr"),
    trainacc = avg.by.param("trainacc"),
    testacc = avg.by.param("testacc"))

  # Sanity check – series look smoother
  print(ti.table.cpmeans)

  par(mfrow = c(2, 1))

  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainerr, pch = 22, bg = "blue", type = "b",
    main = "Avg Training vs Test Error by cp param", xlab = "cp param", ylab = "Error", ylim = c(.15, .21))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testerr, pch = 22, bg = "red", type = "b")

  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainacc, pch = 22, bg = "blue", type = "b",
    main = "Avg Training vs Test Accuracy by cp param", xlab = "cp param", ylab = "Accuracy", ylim = c(.78, .86))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testacc, pch = 22, bg = "red", type = "b")

  print('=============== END: jj72E ===============')

}




#
# jj72
# -------
jj72 <- function() {
  # Fit the 14VAR rpart classification tree (cp = .00580, minsplit = 20) on
  # all of trainV2.csv, plot it, print the training confusion matrix /
  # accuracy, then score testV2.csv and write the predicted classes to
  # testpredictions_jj72C.csv.
  #
  # Takes no arguments. Relies on accuracyFrom2x2ConfusionMatrix(), which is
  # defined elsewhere in this file.

  print('=============== MODEL: jj72 rpart trainV2 14VAR as factors ===============')

  # Load explicitly, as the gbm / randomForest routines do for their packages
  library(rpart)

  # Shared preparation for the train and test frames: convert the binned /
  # count columns to factors and add the combo variables.
  # Columns are added by name rather than by position, so the result does not
  # depend on how many columns the CSV happens to have.
  add.features <- function(df) {
    for (v in c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")) {
      df[[v]] <- as.factor(df[[v]])
    }
    # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
    cabin.x <- ifelse(df$cabincode == 'X', 1, 0)
    df$sex_pclass    <- as.factor(paste0(df$sex, df$pclass))
    df$sex_agebin3   <- as.factor(paste0(df$sex, df$agebin3))
    df$sex_farebin3  <- as.factor(paste0(df$sex, df$farebin3))
    df$sex_cabinX    <- as.factor(paste0(df$sex, cabin.x))
    df$pclass_cabinX <- as.factor(paste0(df$pclass, cabin.x))
    df
  }

  # Load the trainV2 TRAINING data (DATA PREP done in Excel).
  # R >= 4.0 no longer converts strings to factors by default; ask for it
  # explicitly so text columns stay factors.
  ti.train <- add.features(
    read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv",
      header = TRUE, sep = ",", stringsAsFactors = TRUE))
  ti.train$survived <- as.factor(ti.train$survived)

  # Adjusted variable "parch" from 9 to 6 in rows 344 and 367 of testV2.csv to normalize train/test factor vars

  ti.tree <- rpart(
    survived ~ sex + sibsp + parch + pclass + embarked + cabincode +
      caibinlen5 + agebin3 + farebin3 +
      sex_pclass + sex_agebin3 + sex_farebin3 + sex_cabinX + pclass_cabinX,
    method = "class",
    data = ti.train,
    cp = .00580,              ## cp = .015,   # cp = .01,
    minsplit = 20)

  par(mfrow = c(1, 1))
  print(plot(ti.tree))
  print(text(ti.tree))

  ti.train.pred <- predict(ti.tree, type = "class")

  ti.train.tab <- table(ti.train.pred, ti.train$survived, dnn = c("predicted", "actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj72 starting TEST predictions ===============')

  # Load the testV2 TEST data (DATA PREP done in Excel)
  ti.test <- add.features(
    read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv",
      header = TRUE, sep = ",", stringsAsFactors = TRUE))

  ti.test.pred <- predict(ti.tree, ti.test, type = "class")

  # Save predictions.
  # write.csv() ignores col.names (with a warning) and always writes a header
  # row, so the argument is not passed; the file contents are unchanged.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj72C.csv",
    row.names = FALSE, quote = FALSE)

}






#
# jj71E - rpart with 12VAR COMBO features EVAL script - choose BEST cp value, then MAKE/use jj71 routine to create test predictions for SUBMIT
# -------            -------------------------

jj71E <- function() {
  # Evaluation script for the 12VAR rpart model.
  #
  # For every seed in `seedvector` a fresh 1/3 holdout is sampled from
  # trainV2.csv; for every cp in `cpvector` a classification tree is fitted on
  # the remaining 2/3 and scored on both parts. Per-run results are collected in
  # `ti.table`, averaged per cp in `ti.table.cpmeans`, printed, and plotted
  # (error on top, accuracy below).
  #
  # Takes no arguments. Returns (invisibly) the closing banner string.
  # Relies on rpart() and accuracyFrom2x2ConfusionMatrix() being available.

  # accuracy w 1/3 holdout train data                      : train     test       nleaf

  # rpart 12VAR as-factors  minsplit = 20 BEST cp = 0.02   : 0.8350168 0.8202020   4

  # rpart 12VAR as-factors  minsplit = 15 BEST cp = 0.015  : 0.8441077 0.8208754   4  0.79904 << -- Kaggle public score as _jj71 new BEST! 11/27
  # rpart 12VAR as-factors  minsplit = 15 BEST cp = 0.01   : 0.8500000 0.8175084   8  
  # rpart 12VAR as-factors  minsplit = 15 BEST cp = 0.0068 : 0.8434343 0.8619529   8  0.78469 << -- Kaggle public score as _jj71A (tweaked to get 8 leaves)

  # * 9VAR  = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3
  # * 12VAR = sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3

  print('=============== MODEL: jj71E rpart trainV2 12VAR variations (xxxx)===============')

  # Load the trainV2 TRAINING data -
  # DATA PREP done in Excel
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header=TRUE, sep=",")

  # Response and coded predictors are categorical, not numeric
  factor_cols <- c("survived", "sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")
  ti.train[factor_cols] <- lapply(ti.train[factor_cols], as.factor)

  # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
  # Added by name so the code does not depend on the CSV's column count.
  ti.train$sex_pclass <- as.factor(paste0(ti.train[["sex"]], ti.train[["pclass"]]))
  ti.train$sex_agebin3 <- as.factor(paste0(ti.train[["sex"]], ti.train[["agebin3"]]))
  ti.train$sex_farebin3 <- as.factor(paste0(ti.train[["sex"]], ti.train[["farebin3"]]))

  # Each seed is both the sampling seed and the batch number
  seedvector <- 1:10
  cpvector <- .0200
  # cpvector = c(.0400, .0350, .0300, .0250, .0200, .0150, .0100, .0080, .0060, .0040)
  n_cp <- length(cpvector)

  # One result row per (seed, cp) pair, stored at a position derived from the
  # actual number of cp values
  runs <- vector("list", length(seedvector) * n_cp)

  for (s in seq_along(seedvector)) {
    # Reset sample seed so each batch's split is reproducible
    set.seed(seedvector[s])
    # Slice new training and test sets with same ratio
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    # Iterate the cp values for this batch
    for (i in seq_along(cpvector)) {
      ti.tree <- rpart(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3,
        method = "class",
        data = ti.trainset,
        cp = cpvector[i],
        minsplit = 20)

      # if (i %in% c(1,3,5,7)) { plot(ti.tree); text(ti.tree) }

      ti.test.predictions <- predict(ti.tree, ti.testset, type = "class")
      ti.train.predictions <- predict(ti.tree, type = "class")

      runs[[(s - 1) * n_cp + i]] <- data.frame(
        batch = s,
        cp = cpvector[i],
        trainerr = sum(ti.train.predictions != ti.trainset$survived) / nrow(ti.trainset),
        testerr = sum(ti.test.predictions != ti.testset$survived) / nrow(ti.testset),
        trainacc = accuracyFrom2x2ConfusionMatrix(table(ti.train.predictions, ti.trainset$survived)),
        testacc = accuracyFrom2x2ConfusionMatrix(table(ti.test.predictions, ti.testset$survived)))
    }
  }
  ti.table <- do.call(rbind, runs)

  # Sanity check of ti table: one row per seed/cp combination.
  # Explicit print() is required; bare expressions inside a function are discarded.
  print(dim(ti.table))
  print(head(ti.table, 4))

  # Average the train/test error and accuracy over all batches for each cp
  ti.table.cpmeans <- do.call(rbind, lapply(cpvector, function(cp_value) {
    cp_rows <- ti.table[ti.table$cp == cp_value, ]
    data.frame(
      batch = 1,
      cp = cp_value,
      trainerr = mean(cp_rows$trainerr),
      testerr = mean(cp_rows$testerr),
      trainacc = mean(cp_rows$trainacc),
      testacc = mean(cp_rows$testacc))
  }))

  # Sanity check – series look smoother
  print(ti.table.cpmeans)

  # Two stacked panels; restore the caller's layout when the function exits
  old_par <- par(mfrow=c(2,1))
  on.exit(par(old_par), add = TRUE)

  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainerr, pch=22, bg="blue", type="b",
    main="Avg Training vs Test Error by cp param", xlab="cp param", ylab="Error", ylim=c(.15, .21))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testerr, pch=22, bg="red", type="b")

  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainacc, pch=22, bg="blue", type="b",
    main="Avg Training vs Test Accuracy by cp param", xlab="cp param", ylab="Accuracy", ylim=c(.78, .86))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testacc, pch=22, bg="red", type="b")

  print('=============== END: jj71E ===============')

}



#
# jj71
# -------
jj71 <- function() {
  # Fits the 12VAR rpart classification tree on all of trainV2.csv, prints the
  # training confusion matrix and accuracy, then predicts testV2.csv and writes
  # the predictions to testpredictions_jj71A.csv.
  #
  # Takes no arguments; called for its side effects (console output, a tree
  # plot, and the predictions file).
  # Relies on rpart() and accuracyFrom2x2ConfusionMatrix() being available.

  print('=============== MODEL: jj71 rpart trainV2 12VAR as factors ===============')

  coded_cols <- c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")

  # Shared preparation for train and test: coded predictors become factors and
  # the three sex-combination features are added by name (not by column position).
  add_features <- function(df) {
    df[coded_cols] <- lapply(df[coded_cols], as.factor)
    # -------- PROPOSED Secret Sauce primer: ADD combo variables -----------
    df$sex_pclass <- as.factor(paste0(df[["sex"]], df[["pclass"]]))
    df$sex_agebin3 <- as.factor(paste0(df[["sex"]], df[["agebin3"]]))
    df$sex_farebin3 <- as.factor(paste0(df[["sex"]], df[["farebin3"]]))
    df
  }

  # Load the trainV2 TRAINING data -
  # DATA PREP done in Excel
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header=TRUE, sep=",")
  ti.train$survived <- as.factor(ti.train$survived)
  ti.train <- add_features(ti.train)

  # Adjusted variable "parch" from 9 to 6 in rows 344 and 367 of testV2.csv to normalize train/test factor vars

  ti.tree <- rpart(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3 +sex_pclass +sex_agebin3 +sex_farebin3,
    method = "class",
    data = ti.train,
    cp = .0068,           # cp = .01,
    minsplit = 15)

  # Draw the tree; plot()/text() are called directly so their return values
  # are not echoed to the console
  par(mfrow=c(1,1))
  plot(ti.tree)
  text(ti.tree)

  # Resubstitution (training) performance
  ti.train.pred <- predict(ti.tree, type = "class")
  ti.train.tab <- table(ti.train.pred, ti.train$survived, dnn=c("predicted","actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj71 starting TEST predictions ===============')

  # Load the testV2 TEST data -
  # DATA PREP done in Excel
  ti.test <- read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv", header=TRUE, sep=",")
  ti.test <- add_features(ti.test)

  ti.test.pred <- predict(ti.tree, ti.test, type = "class")

  # Save predictions.
  # write.csv() always writes a header line and ignores `col.names` with a
  # warning, so the argument is not passed; use write.table(sep = ",",
  # col.names = FALSE) instead if a header-less file is required.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj71A.csv",
    row.names=FALSE, quote = FALSE)

}



#
# jj7E - rpart EVAL script - choose BEST cp value, then use jj7 routine to create test predictions for SUBMIT
# -------
jj7E <- function() {
  # Evaluation script for the 9-variable rpart model.
  #
  # For every seed in `seedvector` a fresh 1/3 holdout is sampled from
  # trainV2.csv; for every cp in `cpvector` a classification tree is fitted on
  # the remaining 2/3 and scored on both parts. Per-run results are collected in
  # `ti.table`, averaged per cp in `ti.table.cpmeans`, printed, and plotted
  # (error on top, accuracy below).
  #
  # Takes no arguments. Returns (invisibly) the closing banner string.
  # Relies on rpart() and accuracyFrom2x2ConfusionMatrix() being available.

  # accuracy w 1/3 holdout train data               : train  test   nleaf
  # rpart as-factors  minsplit =  5 BEST cp = 0.004 : 0.8819 0.8242  23  0.76077 << -- Kaggle public score as _jj7B WAY OVERFIT!
  # rpart as-factors  minsplit = 10 BEST cp = 0.006 : 0.8641 0.8222  13
  # rpart as-factors  minsplit = 20 BEST cp = 0.02  : 0.8197 0.8118   4  0.78469 << -- Kaggle public score as _jj7
  # rpart as-factors  minsplit = 20 BEST cp = 0.01  : 0.8398 0.8087   8  
  # rpart as-factors  minsplit = 30 BEST cp = 0.025 : 0.8156 0.8107   4

  # rpart as-numerics minsplit = 20 BEST cp = 0.01  : 0.8107 0.8061   8
  # rpart as-numerics minsplit = 10 BEST cp = 0.015 : 0.8476 0.8087  12 (not the BEST test acc, but narrower train-test gap)

  print('=============== MODEL: jj7E rpart trainV2 variations (xxxx)===============')

  # Load the trainV2 TRAINING data -
  # DATA PREP done in Excel
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header=TRUE, sep=",")

  # Response and coded predictors are categorical, not numeric
  factor_cols <- c("survived", "sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")
  ti.train[factor_cols] <- lapply(ti.train[factor_cols], as.factor)

  # Each seed is both the sampling seed and the batch number
  seedvector <- 1:10
  # cpvector = .01
  cpvector <- c(.0400, .0350, .0300, .0250, .0200, .0150, .0100, .0080, .0060, .0040)
  n_cp <- length(cpvector)

  # One result row per (seed, cp) pair. The slot is derived from the actual
  # number of cp values, so any cpvector length (including 1) fills the table
  # without gaps.
  runs <- vector("list", length(seedvector) * n_cp)

  for (s in seq_along(seedvector)) {
    # Reset sample seed so each batch's split is reproducible
    set.seed(seedvector[s])
    # Slice new training and test sets with same ratio
    testids <- sample(nrow(ti.train), nrow(ti.train)/3)
    ti.testset <- ti.train[testids, ]
    ti.trainset <- ti.train[-testids, ]

    # Iterate the cp values for this batch
    for (i in seq_along(cpvector)) {
      ti.tree <- rpart(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3,
        method = "class",
        data = ti.trainset,
        cp = cpvector[i],
        minsplit = 20)

      # if (i %in% c(1,3,5,7)) { plot(ti.tree); text(ti.tree) }

      ti.test.predictions <- predict(ti.tree, ti.testset, type = "class")
      ti.train.predictions <- predict(ti.tree, type = "class")

      runs[[(s - 1) * n_cp + i]] <- data.frame(
        batch = s,
        cp = cpvector[i],
        trainerr = sum(ti.train.predictions != ti.trainset$survived) / nrow(ti.trainset),
        testerr = sum(ti.test.predictions != ti.testset$survived) / nrow(ti.testset),
        trainacc = accuracyFrom2x2ConfusionMatrix(table(ti.train.predictions, ti.trainset$survived)),
        testacc = accuracyFrom2x2ConfusionMatrix(table(ti.test.predictions, ti.testset$survived)))
    }
  }
  ti.table <- do.call(rbind, runs)

  # Sanity check of ti table: one row per seed/cp combination.
  # Explicit print() is required; bare expressions inside a function are discarded.
  print(dim(ti.table))
  print(head(ti.table, 4))

  # Average the train/test error and accuracy over all batches for each cp
  ti.table.cpmeans <- do.call(rbind, lapply(cpvector, function(cp_value) {
    cp_rows <- ti.table[ti.table$cp == cp_value, ]
    data.frame(
      batch = 1,
      cp = cp_value,
      trainerr = mean(cp_rows$trainerr),
      testerr = mean(cp_rows$testerr),
      trainacc = mean(cp_rows$trainacc),
      testacc = mean(cp_rows$testacc))
  }))

  # Sanity check – series look smoother
  print(ti.table.cpmeans)

  # Two stacked panels; restore the caller's layout when the function exits
  old_par <- par(mfrow=c(2,1))
  on.exit(par(old_par), add = TRUE)

  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainerr, pch=22, bg="blue", type="b",
    main="Avg Training vs Test Error by cp param", xlab="cp param", ylab="Error", ylim=c(.15, .21))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testerr, pch=22, bg="red", type="b")

  plot(ti.table.cpmeans$cp, ti.table.cpmeans$trainacc, pch=22, bg="blue", type="b",
    main="Avg Training vs Test Accuracy by cp param", xlab="cp param", ylab="Accuracy", ylim=c(.78, .86))
  points(ti.table.cpmeans$cp, ti.table.cpmeans$testacc, pch=22, bg="red", type="b")

  print('=============== END: jj7E ===============')

}



#
# jj7
# -------
jj7 <- function() {
  # Fits the 9-variable rpart classification tree on all of trainV2.csv, prints
  # the training confusion matrix and accuracy, then predicts testV2.csv and
  # writes the predictions to testpredictions_jj7B.csv.
  #
  # Takes no arguments; called for its side effects (console output, a tree
  # plot, and the predictions file).
  # Relies on rpart() and accuracyFrom2x2ConfusionMatrix() being available.

  print('=============== MODEL: jj7 rpart trainV2 as factors (8182)===============')

  coded_cols <- c("sibsp", "parch", "pclass", "caibinlen5", "agebin3", "farebin3")

  # Shared preparation for train and test: coded predictors become factors
  as_factor_cols <- function(df) {
    df[coded_cols] <- lapply(df[coded_cols], as.factor)
    df
  }

  # Load the trainV2 TRAINING data -
  # DATA PREP done in Excel
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header=TRUE, sep=",")
  ti.train$survived <- as.factor(ti.train$survived)
  ti.train <- as_factor_cols(ti.train)

  # Adjusted variable "parch" from 9 to 6 in rows 344 and 367 of testV2.csv to normalize train/test factor vars

  ti.tree <- rpart(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3,
    method = "class", data = ti.train, minsplit = 20, cp = 0.01)

  # Draw the tree; plot()/text() are called directly so their return values
  # are not echoed to the console
  par(mfrow=c(1,1))
  plot(ti.tree)
  text(ti.tree)

  # Resubstitution (training) performance
  ti.train.pred <- predict(ti.tree, type = "class")
  ti.train.tab <- table(ti.train.pred, ti.train$survived, dnn=c("predicted","actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj7 starting TEST predictions ===============')

  # Load the testV2 TEST data -
  # DATA PREP done in Excel
  ti.test <- read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv", header=TRUE, sep=",")
  ti.test <- as_factor_cols(ti.test)

  ti.test.pred <- predict(ti.tree, ti.test, type = "class")

  # Save predictions.
  # write.csv() always writes a header line and ignores `col.names` with a
  # warning, so the argument is not passed; use write.table(sep = ",",
  # col.names = FALSE) instead if a header-less file is required.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj7B.csv",
    row.names=FALSE, quote = FALSE)

}




#
# jj6
# -------
jj6 <- function() {
  # Fits a 5000-tree randomForest classifier on trainV2.csv using the 9
  # prepared variables, prints the confusion matrix and accuracy of
  # predict(ti.rf) on the training data, then predicts testV2.csv and writes
  # the predictions to testpredictions_jj6.csv.
  #
  # Takes no arguments; called for its side effects (console output and the
  # predictions file). Attaches the randomForest package.
  # Relies on accuracyFrom2x2ConfusionMatrix() being available.

  print('=============== MODEL: jj6 randomForest trainV2 ===============')

  # Load the trainV2 TRAINING data -
  # DATA PREP done in Excel
  # stringsAsFactors is requested explicitly: since R 4.0.0 read.table() keeps
  # strings as character by default, while this model was written for text
  # columns arriving as factors.
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/trainV2.csv", header=TRUE, sep=",",
    stringsAsFactors = TRUE)

  # == Train and eval ==
  # first set response var to a factor so randomForest does classification instead of regression
  ti.train$survived <- as.factor(ti.train$survived)
  library(randomForest)

  # NOTE(review): `method` is not a documented randomForest() argument and is
  # presumably ignored; kept as in the original call - confirm before removing.
  ti.rf <- randomForest(survived ~ sex +sibsp +parch +pclass +embarked +cabincode +caibinlen5 +agebin3 +farebin3,
    method = "class",
    data = ti.train,
    ntree = 5000)

  # plot(ti.rf)
  ti.train.pred <- predict(ti.rf)
  ti.train.tab <- table(ti.train.pred, ti.train$survived, dnn=c("predicted","actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj6 starting TEST predictions ===============')

  # Load the testV2 TEST data -
  # DATA PREP done in Excel
  ti.test <- read.table("file://C:/Users/Joe/My Documents/Titanic/testV2.csv", header=TRUE, sep=",",
    stringsAsFactors = TRUE)

  print("test predictions being output to ti.test.pred...")
  predictor_cols <- c("pclass", "sex", "sibsp", "parch", "embarked", "cabincode", "caibinlen5", "agebin3", "farebin3")
  ti.test.pred <- predict(ti.rf, ti.test[, predictor_cols])

  # Save predictions.
  # write.csv() always writes a header line and ignores `col.names` with a
  # warning, so the argument is not passed; use write.table(sep = ",",
  # col.names = FALSE) instead if a header-less file is required.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions_jj6.csv",
    row.names=FALSE, quote = FALSE)

}



#
# jj5
# -------
jj5 <- function() {
  # Fits a 500-tree randomForest classifier on train.csv using 5 original
  # columns plus 3 derived ones (cabinlisted, agebin, farebin), prints the
  # confusion matrix and accuracy of predict(ti.rf) on the training data, then
  # applies the same preparation to test.csv and writes the predictions to
  # testpredictions.csv.
  #
  # Takes no arguments; called for its side effects (console output, an error
  # plot, and the predictions file). Attaches the randomForest package.
  # Relies on accuracyFrom2x2ConfusionMatrix() being available.

  print('=============== MODEL: jj5 randomForest 5 orig + 3 derived ===============')

  # Bin age into upper-bound labels chosen from data inspection;
  # a missing age falls into the 50 bin.
  bin_age <- function(age) {
    ifelse(is.na(age), 50,
      ifelse(age <= 9, 9,
        ifelse(age <= 24, 24,
          ifelse(age <= 50, 50, 99))))
  }

  # Bin fare into labels chosen from data inspection;
  # a missing fare falls into the 2600 bin.
  bin_fare <- function(fare) {
    ifelse(is.na(fare), 2600,
      ifelse(fare <= 7.75, 775,
        ifelse(fare <= 26.0, 2600, 9900)))
  }

  # Shared preparation for train and test: drop the name column and add the
  # three derived columns, computed for all rows at once.
  prepare <- function(raw) {
    # remove name col
    prepared <- raw[, ! colnames(raw) %in% "name"]
    # cabinlisted: 0 when the cabin field is empty, 1 otherwise
    prepared$cabinlisted <- ifelse(prepared$cabin == "", 0, 1)
    prepared$agebin <- bin_age(prepared$age)
    prepared$farebin <- bin_fare(prepared$fare)
    prepared
  }

  # Load the TRAINING data
  # stringsAsFactors is requested explicitly: since R 4.0.0 read.table() keeps
  # strings as character by default, while this model was written for text
  # columns arriving as factors.
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/train.csv", header=TRUE, sep=",",
    stringsAsFactors = TRUE)
  ti.train2 <- prepare(ti.train)

  # assign missing embarked values to most common factor
  # ti.train2[ti.train2$embarked=="", 10] = "S"

  # == Train and eval ==
  # first set response var to a factor so randomForest does classification instead of regression
  ti.train2$survived <- as.factor(ti.train2$survived)
  library(randomForest)
  # NOTE(review): `method` is not a documented randomForest() argument and is
  # presumably ignored; kept as in the original call - confirm before removing.
  ti.rf <- randomForest(survived ~ sex+sibsp+parch+pclass+embarked+cabinlisted+farebin+agebin,
    method = "class",
    data = ti.train2, ntree = 500)
  # Called directly so plot()'s return value is not echoed to the console
  plot(ti.rf)
  ti.train.pred <- predict(ti.rf)
  ti.train.tab <- table(ti.train.pred, ti.train2$survived, dnn=c("predicted","actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))

  print('=============== MODEL: jj5 starting TEST predictions ===============')

  # Load the TEST data
  ti.test <- read.table("file://C:/Users/Joe/My Documents/Titanic/test.csv", header=TRUE, sep=",",
    stringsAsFactors = TRUE)
  ti.test2 <- prepare(ti.test)

  # assign missing embarked values to most common factor
  # ti.test2[ti.test2$embarked=="", 9] = "S"

  print("test predictions being output to ti.test.pred...")
  predictor_cols <- c("pclass", "sex", "sibsp", "parch", "embarked", "cabinlisted", "farebin", "agebin")
  ti.test.pred <- predict(ti.rf, ti.test2[, predictor_cols])

  # Save predictions.
  # write.csv() always writes a header line and ignores `col.names` with a
  # warning, so the argument is not passed; use write.table(sep = ",",
  # col.names = FALSE) instead if a header-less file is required.
  write.csv(ti.test.pred, "file://C:/Users/Joe/My Documents/Titanic/testpredictions.csv",
    row.names=FALSE, quote = FALSE)

}



#
# jj4
# -------
jj4 <- function() {
  # Fits an rpart classification tree on train.csv using 5 original columns
  # plus 3 derived ones (cabinlisted, agebin, farebin), plots the tree, and
  # prints the training confusion matrix and accuracy.
  #
  # Takes no arguments. Returns (invisibly) the printed training accuracy.
  # Relies on rpart() and accuracyFrom2x2ConfusionMatrix() being available.

  print('=============== MODEL: jj4 rpart 5 orig + 3 derived ===============')
  # Load the data
  ti.train <- read.table("file://C:/Users/Joe/My Documents/Titanic/train.csv", header=TRUE, sep=",")

  # remove name col
  ti.train2 <- ti.train[, ! colnames(ti.train) %in% "name"]

  # Derived columns are computed for all rows at once, so no placeholder
  # column has to be created first.

  # cabinlisted: 0 when the cabin field is empty, 1 otherwise
  ti.train2$cabinlisted <- ifelse(ti.train2$cabin == "", 0, 1)

  # agebin: upper-bound labels chosen from data inspection; missing age -> 50
  age <- ti.train2$age
  ti.train2$agebin <- ifelse(is.na(age), 50,
    ifelse(age <= 9, 9,
      ifelse(age <= 24, 24,
        ifelse(age <= 50, 50, 99))))

  # farebin: labels chosen from data inspection; missing fare -> 2600
  fare <- ti.train2$fare
  ti.train2$farebin <- ifelse(is.na(fare), 2600,
    ifelse(fare <= 7.75, 775,
      ifelse(fare <= 26.0, 2600, 9900)))

  # Train and eval
  ti.tree <- rpart(survived ~ sex+sibsp+parch+pclass+embarked+cabinlisted+farebin+agebin, method = "class", data = ti.train2)
  # Called directly so plot()/text() return values are not echoed to the console
  plot(ti.tree)
  text(ti.tree)
  ti.train.pred <- predict(ti.tree, type = "class")
  ti.train.tab <- table(ti.train.pred, ti.train2$survived, dnn=c("predicted","actual"))
  print(ti.train.tab)
  print(accuracyFrom2x2ConfusionMatrix(ti.train.tab))
}




#
# jj3
# -------
jj3 <- function() {
  # Baseline rpart classification tree on sex + sibsp + parch.
  # Reads train.csv, fits and draws the tree, then prints the training
  # confusion matrix followed by its accuracy.

  print('=============== MODEL: jj3 rpart sex+sibsp+parch ===============')

  # Training data
  train_file <- "file://C:/Users/Joe/My Documents/Titanic/train.csv"
  passengers <- read.table(train_file, header = TRUE, sep = ",")

  # Fit, then show the tree with its split labels
  fitted_tree <- rpart(survived ~ sex+sibsp+parch, method = "class", data = passengers)
  print(plot(fitted_tree))
  print(text(fitted_tree))

  # Predicted class vs. recorded outcome on the training rows
  predicted_class <- predict(fitted_tree, type = "class")
  confusion <- table(predicted_class, passengers$survived, dnn = c("predicted", "actual"))
  print(confusion)
  print(accuracyFrom2x2ConfusionMatrix(confusion))
}


#
# jj2
# -------
jj2 <- function() {
  # Baseline rpart classification tree on sex + sibsp.
  # Reads train.csv, fits and draws the tree, then prints the training
  # confusion matrix followed by its accuracy.

  print('=============== MODEL: jj2 rpart sex+sibsp===============')

  # Training data
  train_file <- "file://C:/Users/Joe/My Documents/Titanic/train.csv"
  passengers <- read.table(train_file, header = TRUE, sep = ",")

  # Fit, then show the tree with its split labels
  fitted_tree <- rpart(survived ~ sex+sibsp, method = "class", data = passengers)
  print(plot(fitted_tree))
  print(text(fitted_tree))

  # Predicted class vs. recorded outcome on the training rows
  predicted_class <- predict(fitted_tree, type = "class")
  confusion <- table(predicted_class, passengers$survived, dnn = c("predicted", "actual"))
  print(confusion)
  print(accuracyFrom2x2ConfusionMatrix(confusion))
}


#
# jj1
# -------
jj1 <- function() {
  # Baseline rpart classification tree on sex + pclass.
  # Reads train.csv, fits the tree (no plot), then prints the training
  # confusion matrix followed by its accuracy.

  print('=============== MODEL: jj1 rpart sex+pclass ===============')

  # Training data
  train_file <- "file://C:/Users/Joe/My Documents/Titanic/train.csv"
  passengers <- read.table(train_file, header = TRUE, sep = ",")

  fitted_tree <- rpart(survived ~ sex+pclass, method = "class", data = passengers)

  # Predicted class vs. recorded outcome on the training rows
  predicted_class <- predict(fitted_tree, type = "class")
  confusion <- table(predicted_class, passengers$survived, dnn = c("predicted", "actual"))
  print(confusion)
  print(accuracyFrom2x2ConfusionMatrix(confusion))
}


#
# accuracyFrom2x2ConfusionMatrix
# -------------------------------
accuracyFrom2x2ConfusionMatrix <- function(tb) {
  # Accuracy of a 2x2 confusion matrix: (tb[1,1] + tb[2,2]) / total count.
  #
  # tb: a two-dimensional object with exactly 2 rows and 2 columns.
  # Returns the accuracy, or the sentinel -1 when `tb` is not 2x2.
  shape <- dim(tb)

  # Anything that is not exactly two-dimensional and 2-by-2 is rejected
  if (length(shape) != 2 || shape[1] != 2 || shape[2] != 2) {
    return(-1)
  }

  # The diagonal cells hold the agreeing predictions
  agreeing <- tb[1, 1] + tb[2, 2]
  agreeing / sum(tb)
}