Feature engineering

##    
##     FALSE TRUE
##   0   135   30
##   1    30   67

## TPR:  0.690721649484536

## FPR:  0.181818181818182

## Cost:  90

## Optimal Threshold:  0.58

## TPR:  0.65979381443299

## FPR:  0.096969696969697

## Area under the curve: 0.819

##                  [,1]
## sensitivity 0.7422680
## specificity 0.7878788
## cutoff      0.4175980

## Optimal TPR:  0.635348043847331

getwd()

## [1] "C:/Users/jorn_/Desktop/IE/MBD Period 2/Machine learning II/Tit"

setwd("C:/Users/jorn_/Desktop/IE/MBD Period 2/Machine learning II/Tit/")
df <- read.csv('titanic.csv', sep=';', header=T)
colnames(df)[1] <- 'pclass'
df$pclass <- as.factor(df$pclass)
df$survived <- as.factor(df$survived)
df$sex <- as.factor(df$sex)
df$age <- as.numeric(df$age)
df$fare <- as.numeric(df$fare)
df <- df[complete.cases(df),]

df$embarked[df$embarked==""] <- NA
df[is.na(df$embarked),]

##     pclass survived    sex age sibsp parch ticket fare cabin embarked
## 169      1        1 female  53     0     0 113572  258   B28     <NA>
## 285      1        1 female  85     0     0 113572  258   B28     <NA>

ggplot(df[df$pclass == '1' & df$fare > 200, ], 
  aes(x = embarked)) +
  geom_density(fill = '#99d6ff', alpha=0.4) + 
  geom_vline(aes(xintercept=median(fare, na.rm=T)),
    colour='red', linetype='dashed', lwd=1)

## As we can see this group has the highest density in C so we give these NAs the embarked C
sapply(df, function(x) sum(is.na(x)))

##   pclass survived      sex      age    sibsp    parch   ticket     fare 
##        0        0        0        0        0        0        0        0 
##    cabin embarked 
##        0        2

df$embarked[is.na(df$embarked)] <- 'C'
str(df)

## 'data.frame':    1309 obs. of  10 variables:
##  $ pclass  : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ survived: Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 2 1 ...
##  $ sex     : Factor w/ 3 levels "","female","male": 2 3 2 3 2 3 2 3 2 3 ...
##  $ age     : num  39 8 23 41 33 67 86 55 73 94 ...
##  $ sibsp   : int  0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ ticket  : Factor w/ 930 levels "","110152","110413",..: 189 51 51 51 51 126 94 17 78 827 ...
##  $ fare    : num  79 58 58 58 58 103 236 2 157 153 ...
##  $ cabin   : Factor w/ 187 levels "","A10","A11",..: 45 81 81 81 81 151 147 17 63 1 ...
##  $ embarked: Factor w/ 4 levels "","C","Q","S": 4 4 4 4 4 4 4 4 4 2 ...

## Therefor I filtered out the 1 year olds
qplot(df$age, bins = 15)

df <- df[df$age != 1,]
qplot(df$age, bins = 15, col = df$sex)

df <- df %>%
  subset(select=c(-ticket, -cabin))
d = qplot(df$age, fill = df$survived) + facet_grid(~df$sex)
d

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot(df$survived, df$sex, xlab = "Sex", ylab = "Survived", )

## Next I took a look at the factors parch and sibsp, can these predict MAYBE DO CATEGORIES

df$Fsize <- df$sibsp + df$parch + 1

ggplot(df, aes(x = Fsize, fill = factor(df$survived))) +
  geom_bar(stat='count', position='dodge') +
  scale_x_continuous(breaks=c(1:11)) +
  labs(x = 'Family Size')

df$FsizeD[df$Fsize == 1] <- 'singleton'
df$FsizeD[df$Fsize < 5 & df$Fsize > 1] <- 'small'
df$FsizeD[df$Fsize > 4] <- 'large'

ggplot(df, aes(x = embarked, y = fare, fill = factor(pclass))) +
  geom_boxplot() +
  geom_hline(aes(yintercept=80), 
    colour='red', linetype='dashed', lwd=2)

## We want to check the point-biserial on Survived to see the correlation

b1 <- biserial.cor(df$age, df$survived, use = c("all.obs", "complete.obs"), level = 1)
b2 <- biserial.cor(df$sibsp, df$survived, use = c("all.obs", "complete.obs"), level = 1)
b3 <- biserial.cor(df$Fsize, df$survived, use = c("all.obs", "complete.obs"), level = 1)
b4 <- biserial.cor(df$parch, df$survived, use = c("all.obs", "complete.obs"), level = 1)
b5 <- biserial.cor(df$fare, df$survived, use = c("all.obs", "complete.obs"), level = 1)

bcol <- c(b1,b2,b3,b4,b5)
plot(bcol)

## just a quick look at the sex correlation to surviving
ggplot(df, aes(age, fill = factor(survived))) + 
  geom_histogram() + 
  facet_grid(.~sex)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## next we take a look at the fare density 

lmmod <- function(df, size = 10)
pred <- lmmod(df, size = 10)

## after looking at the boxplot You can clearly see that Fare is to be standardized. Though, there is a suspicion that Fare and pclass are correlated. 

View(df)
att.scores <- random.forest.importance(survived ~ ., df )

splitdf <- function(dataframe, seed=NULL, percentage=0.8) {
  if (!is.null(seed)) set.seed(seed)
  index <- 1:nrow(dataframe)
  numTrainingSamples <- round(length(index) * percentage)
  trainindex <- sample(index, numTrainingSamples)
  trainset <- dataframe[trainindex, ]
  testset <- dataframe[-trainindex, ]
  list(trainset=trainset,testset=testset)
}

confusionMatrix <- function(predictions, threshold) {
  table(predictions$survived, factor((predictions$pred > threshold),levels=c("FALSE","TRUE")))
}

tpr <- function(predictions, threshold) {
  confMat <- confusionMatrix(predictions, threshold)
  confMat["1", "TRUE"] / sum(confMat["1", ])  # True Positive Rate = TP/TP+FN
}


fpr <- function(predictions, threshold) {
  confMat <- confusionMatrix(predictions, threshold)
  confMat["0", "TRUE"] / sum(confMat["0", ])  # False Positive Rate = FP/FP+TN 
}


cost <- function(predictions, threshold, fpCost=1, fnCost=2) {
  confMat <- confusionMatrix(predictions, threshold)
  (confMat["0", "TRUE"] * fpCost) + (confMat["1", "FALSE"] * fnCost)
}

autoCut <- function(dataframe, partition = 5){
 simple = as.simple.formula(cutoff.k(random.forest.importance(survived ~ ., dataframe),partition), "survived") 
 
 glm(simple, data = df, family = "binomial")
}

modelEvaluation <- function(split) {
model <-glm(survived~., data = split$trainset, family = "binomial")

probs <-predict(model, type="response", newdata = split$testset)

predictions <-data.frame(survived = split$testset$survived, pred=probs)
myROC <-roc(survived ~ probs, predictions)
optimalThreshold <-coords(myROC, "best", ret = "threshold")

T <-table(predictions$survived, predictions$pred > optimalThreshold)
F1 <- (2*(T[1,1]))/((2*(T[1,1]))+T[2,1]+T[1,2])
F1
}

## Question 1 

dhhd <- function(df){
  
  data = splitdf(df)
  training = data$trainset
  testing = data$testset
  
  glmmodel = autoCut(df) 
  
  probs <- predict(glmmodel, type="response", newdata = testing)

  predictions <- data.frame(survived=testing$survived, pred=probs)
  
  pred = prediction(predictions$pred, predictions$survived)
  perf <- performance(pred, "tpr", "fpr") 
  plot(perf, type="b", colorize=T)
    
  roc <- data.frame(threshold = (1:99)/100, tpr=NA, fpr=NA)
  roc$tpr <-  sapply(roc$threshold, function(th) tpr(predictions, th))
  roc$fpr <-  sapply(roc$threshold, function(th) fpr(predictions, th))
  roc$cost <- sapply(roc$threshold, function(th) cost(predictions, th))

  idx_threshold <<- which.min(abs(roc$cost))
  idx_threshold <<- idx_threshold/100

  confusionMatrix(predictions,idx_threshold)
  
  randfor<<- random.forest.importance(survived~., df)
  FeatureSec <<- cutoff.k(randfor, 6)
  FeatureSec <<- paste(FeatureSec, sep = " ", collapse=" ")
  
  itpr <<- tpr(predictions, idx_threshold)
  ifpr <<- fpr(predictions, idx_threshold)
  icost <<- cost(predictions, idx_threshold)
  iauc <<- auc(predictions$survived,predictions$pred)
  
  {
  rbPal <- colorRampPalette(c('green','red'))
  roc$col <- rbPal(10)[as.numeric(cut(roc$cost, breaks = 10))]
  par(mfrow=c(1,2))
  # This one is for the ROC (FPR vs. TPR)
  plot(roc$fpr, roc$tpr, xlab="False Positive Rate", ylab="True Positive Rate",
       type="b", cex=1.5, lwd=2, col=adjustcolor(roc$col, alpha=0.5), pch=16, xlim=c(0,1), ylim=c(0,1))
  abline(0,1,lty=2, lwd=2)
  abline(v=roc$fpr[idx_threshold], lty=2, lwd=1, col="grey"); 
  abline(h=roc$tpr[idx_threshold], lty=2, lwd=1, col="grey")
  # This one is for the cost.
  plot(roc$threshold, roc$cost, xlab="Threshold", ylab="Cost", 
       type="b", cex=1.5, lwd=2, pch=16,xlim=c(0,1), col=adjustcolor(roc$col, alpha=0.5))
  abline(v=roc$threshold[idx_threshold], lty=2, lwd=2, col="grey")
  }

  cat(paste("TPR: ",tpr(predictions, idx_threshold),"\n"))
  
  cat(paste("AUC: ",auc(predictions$survived,predictions$pred),"\n"))
  
  cat(paste("FPR: ",fpr(predictions, idx_threshold),"\n"))
  
  cat(paste("Cost: ",cost(predictions, idx_threshold, 1, 2),"\n"))
  
}

mat <- data.frame(threshold = NA, tpr = NA, fpr = NA, cost = NA, Feature = NA, AUC = NA)

for (i in 1:10){
  dhhd(df)
  mat[i,] <- cbind(idx_threshold, itpr, ifpr, icost, FeatureSec, iauc)
  print(FeatureSec)
  i = i+1
  }

## TPR:  0.746987951807229 
## AUC:  0.83744501816791 
## FPR:  0.142857142857143 
## Cost:  60 
## [1] "sex pclass FsizeD fare embarked Fsize"

## TPR:  0.694736842105263 
## AUC:  0.796999076638966 
## FPR:  0.149122807017544 
## Cost:  75 
## [1] "sex pclass FsizeD fare Fsize parch"

## TPR:  0.734939759036145 
## AUC:  0.804886211512718 
## FPR:  0.19047619047619 
## Cost:  68 
## [1] "sex pclass FsizeD fare Fsize embarked"

## TPR:  0.771084337349398 
## AUC:  0.828647925033467 
## FPR:  0.19047619047619 
## Cost:  62 
## [1] "sex pclass fare FsizeD embarked age"

## TPR:  0.691358024691358 
## AUC:  0.798466435185185 
## FPR:  0.171875 
## Cost:  72 
## [1] "sex pclass fare FsizeD Fsize embarked"

## TPR:  0.853333333333333 
## AUC:  0.841990049751244 
## FPR:  0.298507462686567 
## Cost:  62 
## [1] "sex pclass fare FsizeD Fsize embarked"

## TPR:  0.753246753246753 
## AUC:  0.838547815820543 
## FPR:  0.159090909090909 
## Cost:  59 
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR:  0.862068965517241 
## AUC:  0.84887883926889 
## FPR:  0.278688524590164 
## Cost:  58 
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR:  0.793478260869565 
## AUC:  0.772528799702713 
## FPR:  0.401709401709402 
## Cost:  85 
## [1] "sex pclass FsizeD fare Fsize parch"

## TPR:  0.789473684210526 
## AUC:  0.857439651760981 
## FPR:  0.195488721804511 
## Cost:  58 
## [1] "sex pclass fare FsizeD Fsize parch"

##Question 2

splitting_and_testing <- function(df)
  {
  data = splitdf(df,percentage=0.6)
  temp=splitdf(data$testset,percentage= 0.5)
  test = temp$testset
  training = data$trainset
  threshold = temp$trainset

  glmmodel = autoCut(df) 
  
  probs <- predict(glmmodel, type="response", newdata = training)

  predictions <- data.frame(survived=training$survived, pred=probs)
  
  roc <- data.frame(threshold = (1:99)/100, tpr=NA, fpr=NA)
  roc$tpr <-  sapply(roc$threshold, function(th) tpr(predictions, th))
  roc$fpr <-  sapply(roc$threshold, function(th) fpr(predictions, th))
  roc$cost <- sapply(roc$threshold, function(th) cost(predictions, th))
  
  idx_threshold = which.min(abs(roc$cost))
  idx_threshold = idx_threshold/100

  probs2 <- predict(glmmodel, type="response", newdata = test)

  predictions2 <- data.frame(survived=test$survived, pred=probs2)
  
  randfor<<- random.forest.importance(survived~., df)
  FeatureSec1 <<- cutoff.k(randfor, 5)
  FeatureSec1 <<- paste(FeatureSec1, sep = " ", collapse=" ")
  
  itpr1 <<- tpr(predictions, idx_threshold)
  ifpr1 <<- fpr(predictions, idx_threshold)
  icost1 <<- cost(predictions, idx_threshold)
  
  {
  rbPal <- colorRampPalette(c('green','red'))
  roc$col <- rbPal(10)[as.numeric(cut(roc$cost, breaks = 10))]
  par(mfrow=c(1,2))
  # This one is for the ROC (FPR vs. TPR)
  plot(roc$fpr, roc$tpr, xlab="False Positive Rate", ylab="True Positive Rate",
       type="b", cex=1.5, lwd=2, col=adjustcolor(roc$col, alpha=0.5), pch=16, xlim=c(0,1), ylim=c(0,1))
  abline(0,1,lty=2, lwd=2)
  abline(v=roc$fpr[idx_threshold], lty=2, lwd=1, col="grey"); 
  abline(h=roc$tpr[idx_threshold], lty=2, lwd=1, col="grey")
  # This one is for the cost.
  plot(roc$threshold, roc$cost, xlab="Threshold", ylab="Cost", 
       type="b", cex=1.5, lwd=2, pch=16,xlim=c(0,1), col=adjustcolor(roc$col, alpha=0.5))
  abline(v=roc$threshold[idx_threshold], lty=2, lwd=2, col="grey")
}
  confusionMatrix(predictions2,idx_threshold)

  cat(paste("TPR: ",tpr(predictions2, idx_threshold),"\n"))
  
  cat(paste("FPR: ",fpr(predictions2, idx_threshold),"\n"))
  
  cat(paste("Cost: ",cost(predictions2, idx_threshold, 1, 2),"\n"))

}

matcv <- data.frame(threshold = NA, tpr = NA, fpr = NA, cost = NA, Feature = NA)

for (i in 1:10){
  splitting_and_testing(df)
  matcv[i,] <- cbind(idx_threshold, itpr1, ifpr1, icost1, FeatureSec1)
  print(FeatureSec)
  i = i+1
}

## TPR:  0.729411764705882 
## FPR:  0.17741935483871 
## Cost:  68 
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR:  0.815217391304348 
## FPR:  0.290598290598291 
## Cost:  68 
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR:  0.644736842105263 
## FPR:  0.24812030075188 
## Cost:  87 
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR:  0.865853658536585 
## FPR:  0.173228346456693 
## Cost:  44 
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR:  0.722222222222222 
## FPR:  0.218487394957983 
## Cost:  76 
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR:  0.893617021276596 
## FPR:  0.234782608695652 
## Cost:  47 
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR:  0.69620253164557 
## FPR:  0.161538461538462 
## Cost:  69 
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR:  0.855555555555556 
## FPR:  0.369747899159664 
## Cost:  70 
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR:  0.808988764044944 
## FPR:  0.133333333333333 
## Cost:  50 
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR:  0.771739130434783 
## FPR:  0.128205128205128 
## Cost:  57 
## [1] "sex pclass fare FsizeD Fsize parch"

View(matcv)

train_control <- trainControl(method="cv", number=10)
grid <- expand.grid(.fL=c(0), .usekernel=c(FALSE))
model <- train(survived~pclass+sex+age+sibsp+fare+embarked+parch, data=df, family = "binomial", trControl = trainControl(method = "cv", number = 10, verboseIter = TRUE))

## Loading required package: randomForest

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

## + Fold01: mtry= 2 
## - Fold01: mtry= 2 
## + Fold01: mtry= 6 
## - Fold01: mtry= 6 
## + Fold01: mtry=11 
## - Fold01: mtry=11 
## + Fold02: mtry= 2 
## - Fold02: mtry= 2 
## + Fold02: mtry= 6 
## - Fold02: mtry= 6 
## + Fold02: mtry=11 
## - Fold02: mtry=11 
## + Fold03: mtry= 2 
## - Fold03: mtry= 2 
## + Fold03: mtry= 6 
## - Fold03: mtry= 6 
## + Fold03: mtry=11 
## - Fold03: mtry=11 
## + Fold04: mtry= 2 
## - Fold04: mtry= 2 
## + Fold04: mtry= 6 
## - Fold04: mtry= 6 
## + Fold04: mtry=11 
## - Fold04: mtry=11 
## + Fold05: mtry= 2 
## - Fold05: mtry= 2 
## + Fold05: mtry= 6 
## - Fold05: mtry= 6 
## + Fold05: mtry=11 
## - Fold05: mtry=11 
## + Fold06: mtry= 2 
## - Fold06: mtry= 2 
## + Fold06: mtry= 6 
## - Fold06: mtry= 6 
## + Fold06: mtry=11 
## - Fold06: mtry=11 
## + Fold07: mtry= 2 
## - Fold07: mtry= 2 
## + Fold07: mtry= 6 
## - Fold07: mtry= 6 
## + Fold07: mtry=11 
## - Fold07: mtry=11 
## + Fold08: mtry= 2 
## - Fold08: mtry= 2 
## + Fold08: mtry= 6 
## - Fold08: mtry= 6 
## + Fold08: mtry=11 
## - Fold08: mtry=11 
## + Fold09: mtry= 2 
## - Fold09: mtry= 2 
## + Fold09: mtry= 6 
## - Fold09: mtry= 6 
## + Fold09: mtry=11 
## - Fold09: mtry=11 
## + Fold10: mtry= 2 
## - Fold10: mtry= 2 
## + Fold10: mtry= 6 
## - Fold10: mtry= 6 
## + Fold10: mtry=11 
## - Fold10: mtry=11 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set

print(model)

## Random Forest 
## 
## 1046 samples
##    7 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 941, 941, 941, 942, 942, 942, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.7876648  0.5413655
##    6    0.7857692  0.5471661
##   11    0.7847985  0.5469944
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.

Feature engineering

Jorn

February 3, 2017