##
## FALSE TRUE
## 0 135 30
## 1 30 67
## TPR: 0.690721649484536
## FPR: 0.181818181818182
## Cost: 90
## Optimal Threshold: 0.58
## TPR: 0.65979381443299
## FPR: 0.096969696969697
## Area under the curve: 0.819
## [,1]
## sensitivity 0.7422680
## specificity 0.7878788
## cutoff 0.4175980
## Optimal TPR: 0.635348043847331
getwd()
## [1] "C:/Users/jorn_/Desktop/IE/MBD Period 2/Machine learning II/Tit"
setwd("C:/Users/jorn_/Desktop/IE/MBD Period 2/Machine learning II/Tit/")
df <- read.csv('titanic.csv', sep=';', header=T)
colnames(df)[1] <- 'pclass'
df$pclass <- as.factor(df$pclass)
df$survived <- as.factor(df$survived)
df$sex <- as.factor(df$sex)
df$age <- as.numeric(df$age)
df$fare <- as.numeric(df$fare)
df <- df[complete.cases(df),]
df$embarked[df$embarked==""] <- NA
df[is.na(df$embarked),]
## pclass survived sex age sibsp parch ticket fare cabin embarked
## 169 1 1 female 53 0 0 113572 258 B28 <NA>
## 285 1 1 female 85 0 0 113572 258 B28 <NA>
ggplot(df[df$pclass == '1' & df$fare > 200, ],
aes(x = embarked)) +
geom_density(fill = '#99d6ff', alpha=0.4) +
geom_vline(aes(xintercept=median(fare, na.rm=T)),
colour='red', linetype='dashed', lwd=1)

## As we can see this group has the highest density in C so we give these NAs the embarked C
sapply(df, function(x) sum(is.na(x)))
## pclass survived sex age sibsp parch ticket fare
## 0 0 0 0 0 0 0 0
## cabin embarked
## 0 2
df$embarked[is.na(df$embarked)] <- 'C'
str(df)
## 'data.frame': 1309 obs. of 10 variables:
## $ pclass : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ survived: Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 2 1 ...
## $ sex : Factor w/ 3 levels "","female","male": 2 3 2 3 2 3 2 3 2 3 ...
## $ age : num 39 8 23 41 33 67 86 55 73 94 ...
## $ sibsp : int 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : int 0 2 2 2 2 0 0 0 0 0 ...
## $ ticket : Factor w/ 930 levels "","110152","110413",..: 189 51 51 51 51 126 94 17 78 827 ...
## $ fare : num 79 58 58 58 58 103 236 2 157 153 ...
## $ cabin : Factor w/ 187 levels "","A10","A11",..: 45 81 81 81 81 151 147 17 63 1 ...
## $ embarked: Factor w/ 4 levels "","C","Q","S": 4 4 4 4 4 4 4 4 4 2 ...
## Therefor I filtered out the 1 year olds
qplot(df$age, bins = 15)

df <- df[df$age != 1,]
qplot(df$age, bins = 15, col = df$sex)

df <- df %>%
subset(select=c(-ticket, -cabin))
d = qplot(df$age, fill = df$survived) + facet_grid(~df$sex)
d
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot(df$survived, df$sex, xlab = "Sex", ylab = "Survived", )

## Next I took a look at the factors parch and sibsp, can these predict MAYBE DO CATEGORIES
df$Fsize <- df$sibsp + df$parch + 1
ggplot(df, aes(x = Fsize, fill = factor(df$survived))) +
geom_bar(stat='count', position='dodge') +
scale_x_continuous(breaks=c(1:11)) +
labs(x = 'Family Size')

df$FsizeD[df$Fsize == 1] <- 'singleton'
df$FsizeD[df$Fsize < 5 & df$Fsize > 1] <- 'small'
df$FsizeD[df$Fsize > 4] <- 'large'
ggplot(df, aes(x = embarked, y = fare, fill = factor(pclass))) +
geom_boxplot() +
geom_hline(aes(yintercept=80),
colour='red', linetype='dashed', lwd=2)

## We want to check the point-biserial on Survived to see the correlation
b1 <- biserial.cor(df$age, df$survived, use = c("all.obs", "complete.obs"), level = 1)
b2 <- biserial.cor(df$sibsp, df$survived, use = c("all.obs", "complete.obs"), level = 1)
b3 <- biserial.cor(df$Fsize, df$survived, use = c("all.obs", "complete.obs"), level = 1)
b4 <- biserial.cor(df$parch, df$survived, use = c("all.obs", "complete.obs"), level = 1)
b5 <- biserial.cor(df$fare, df$survived, use = c("all.obs", "complete.obs"), level = 1)
bcol <- c(b1,b2,b3,b4,b5)
plot(bcol)

## just a quick look at the sex correlation to surviving
ggplot(df, aes(age, fill = factor(survived))) +
geom_histogram() +
facet_grid(.~sex)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## next we take a look at the fare density
lmmod <- function(df, size = 10)
pred <- lmmod(df, size = 10)
## after looking at the boxplot You can clearly see that Fare is to be standardized. Though, there is a suspicion that Fare and pclass are correlated.
View(df)
att.scores <- random.forest.importance(survived ~ ., df )
splitdf <- function(dataframe, seed=NULL, percentage=0.8) {
if (!is.null(seed)) set.seed(seed)
index <- 1:nrow(dataframe)
numTrainingSamples <- round(length(index) * percentage)
trainindex <- sample(index, numTrainingSamples)
trainset <- dataframe[trainindex, ]
testset <- dataframe[-trainindex, ]
list(trainset=trainset,testset=testset)
}
confusionMatrix <- function(predictions, threshold) {
table(predictions$survived, factor((predictions$pred > threshold),levels=c("FALSE","TRUE")))
}
tpr <- function(predictions, threshold) {
confMat <- confusionMatrix(predictions, threshold)
confMat["1", "TRUE"] / sum(confMat["1", ]) # True Positive Rate = TP/TP+FN
}
fpr <- function(predictions, threshold) {
confMat <- confusionMatrix(predictions, threshold)
confMat["0", "TRUE"] / sum(confMat["0", ]) # False Positive Rate = FP/FP+TN
}
cost <- function(predictions, threshold, fpCost=1, fnCost=2) {
confMat <- confusionMatrix(predictions, threshold)
(confMat["0", "TRUE"] * fpCost) + (confMat["1", "FALSE"] * fnCost)
}
autoCut <- function(dataframe, partition = 5){
simple = as.simple.formula(cutoff.k(random.forest.importance(survived ~ ., dataframe),partition), "survived")
glm(simple, data = df, family = "binomial")
}
modelEvaluation <- function(split) {
model <-glm(survived~., data = split$trainset, family = "binomial")
probs <-predict(model, type="response", newdata = split$testset)
predictions <-data.frame(survived = split$testset$survived, pred=probs)
myROC <-roc(survived ~ probs, predictions)
optimalThreshold <-coords(myROC, "best", ret = "threshold")
T <-table(predictions$survived, predictions$pred > optimalThreshold)
F1 <- (2*(T[1,1]))/((2*(T[1,1]))+T[2,1]+T[1,2])
F1
}
## Question 1
dhhd <- function(df){
data = splitdf(df)
training = data$trainset
testing = data$testset
glmmodel = autoCut(df)
probs <- predict(glmmodel, type="response", newdata = testing)
predictions <- data.frame(survived=testing$survived, pred=probs)
pred = prediction(predictions$pred, predictions$survived)
perf <- performance(pred, "tpr", "fpr")
plot(perf, type="b", colorize=T)
roc <- data.frame(threshold = (1:99)/100, tpr=NA, fpr=NA)
roc$tpr <- sapply(roc$threshold, function(th) tpr(predictions, th))
roc$fpr <- sapply(roc$threshold, function(th) fpr(predictions, th))
roc$cost <- sapply(roc$threshold, function(th) cost(predictions, th))
idx_threshold <<- which.min(abs(roc$cost))
idx_threshold <<- idx_threshold/100
confusionMatrix(predictions,idx_threshold)
randfor<<- random.forest.importance(survived~., df)
FeatureSec <<- cutoff.k(randfor, 6)
FeatureSec <<- paste(FeatureSec, sep = " ", collapse=" ")
itpr <<- tpr(predictions, idx_threshold)
ifpr <<- fpr(predictions, idx_threshold)
icost <<- cost(predictions, idx_threshold)
iauc <<- auc(predictions$survived,predictions$pred)
{
rbPal <- colorRampPalette(c('green','red'))
roc$col <- rbPal(10)[as.numeric(cut(roc$cost, breaks = 10))]
par(mfrow=c(1,2))
# This one is for the ROC (FPR vs. TPR)
plot(roc$fpr, roc$tpr, xlab="False Positive Rate", ylab="True Positive Rate",
type="b", cex=1.5, lwd=2, col=adjustcolor(roc$col, alpha=0.5), pch=16, xlim=c(0,1), ylim=c(0,1))
abline(0,1,lty=2, lwd=2)
abline(v=roc$fpr[idx_threshold], lty=2, lwd=1, col="grey");
abline(h=roc$tpr[idx_threshold], lty=2, lwd=1, col="grey")
# This one is for the cost.
plot(roc$threshold, roc$cost, xlab="Threshold", ylab="Cost",
type="b", cex=1.5, lwd=2, pch=16,xlim=c(0,1), col=adjustcolor(roc$col, alpha=0.5))
abline(v=roc$threshold[idx_threshold], lty=2, lwd=2, col="grey")
}
cat(paste("TPR: ",tpr(predictions, idx_threshold),"\n"))
cat(paste("AUC: ",auc(predictions$survived,predictions$pred),"\n"))
cat(paste("FPR: ",fpr(predictions, idx_threshold),"\n"))
cat(paste("Cost: ",cost(predictions, idx_threshold, 1, 2),"\n"))
}
mat <- data.frame(threshold = NA, tpr = NA, fpr = NA, cost = NA, Feature = NA, AUC = NA)
for (i in 1:10){
dhhd(df)
mat[i,] <- cbind(idx_threshold, itpr, ifpr, icost, FeatureSec, iauc)
print(FeatureSec)
i = i+1
}


## TPR: 0.746987951807229
## AUC: 0.83744501816791
## FPR: 0.142857142857143
## Cost: 60
## [1] "sex pclass FsizeD fare embarked Fsize"


## TPR: 0.694736842105263
## AUC: 0.796999076638966
## FPR: 0.149122807017544
## Cost: 75
## [1] "sex pclass FsizeD fare Fsize parch"


## TPR: 0.734939759036145
## AUC: 0.804886211512718
## FPR: 0.19047619047619
## Cost: 68
## [1] "sex pclass FsizeD fare Fsize embarked"


## TPR: 0.771084337349398
## AUC: 0.828647925033467
## FPR: 0.19047619047619
## Cost: 62
## [1] "sex pclass fare FsizeD embarked age"


## TPR: 0.691358024691358
## AUC: 0.798466435185185
## FPR: 0.171875
## Cost: 72
## [1] "sex pclass fare FsizeD Fsize embarked"


## TPR: 0.853333333333333
## AUC: 0.841990049751244
## FPR: 0.298507462686567
## Cost: 62
## [1] "sex pclass fare FsizeD Fsize embarked"


## TPR: 0.753246753246753
## AUC: 0.838547815820543
## FPR: 0.159090909090909
## Cost: 59
## [1] "sex pclass fare FsizeD Fsize parch"


## TPR: 0.862068965517241
## AUC: 0.84887883926889
## FPR: 0.278688524590164
## Cost: 58
## [1] "sex pclass fare FsizeD Fsize parch"


## TPR: 0.793478260869565
## AUC: 0.772528799702713
## FPR: 0.401709401709402
## Cost: 85
## [1] "sex pclass FsizeD fare Fsize parch"


## TPR: 0.789473684210526
## AUC: 0.857439651760981
## FPR: 0.195488721804511
## Cost: 58
## [1] "sex pclass fare FsizeD Fsize parch"
##Question 2
splitting_and_testing <- function(df)
{
data = splitdf(df,percentage=0.6)
temp=splitdf(data$testset,percentage= 0.5)
test = temp$testset
training = data$trainset
threshold = temp$trainset
glmmodel = autoCut(df)
probs <- predict(glmmodel, type="response", newdata = training)
predictions <- data.frame(survived=training$survived, pred=probs)
roc <- data.frame(threshold = (1:99)/100, tpr=NA, fpr=NA)
roc$tpr <- sapply(roc$threshold, function(th) tpr(predictions, th))
roc$fpr <- sapply(roc$threshold, function(th) fpr(predictions, th))
roc$cost <- sapply(roc$threshold, function(th) cost(predictions, th))
idx_threshold = which.min(abs(roc$cost))
idx_threshold = idx_threshold/100
probs2 <- predict(glmmodel, type="response", newdata = test)
predictions2 <- data.frame(survived=test$survived, pred=probs2)
randfor<<- random.forest.importance(survived~., df)
FeatureSec1 <<- cutoff.k(randfor, 5)
FeatureSec1 <<- paste(FeatureSec1, sep = " ", collapse=" ")
itpr1 <<- tpr(predictions, idx_threshold)
ifpr1 <<- fpr(predictions, idx_threshold)
icost1 <<- cost(predictions, idx_threshold)
{
rbPal <- colorRampPalette(c('green','red'))
roc$col <- rbPal(10)[as.numeric(cut(roc$cost, breaks = 10))]
par(mfrow=c(1,2))
# This one is for the ROC (FPR vs. TPR)
plot(roc$fpr, roc$tpr, xlab="False Positive Rate", ylab="True Positive Rate",
type="b", cex=1.5, lwd=2, col=adjustcolor(roc$col, alpha=0.5), pch=16, xlim=c(0,1), ylim=c(0,1))
abline(0,1,lty=2, lwd=2)
abline(v=roc$fpr[idx_threshold], lty=2, lwd=1, col="grey");
abline(h=roc$tpr[idx_threshold], lty=2, lwd=1, col="grey")
# This one is for the cost.
plot(roc$threshold, roc$cost, xlab="Threshold", ylab="Cost",
type="b", cex=1.5, lwd=2, pch=16,xlim=c(0,1), col=adjustcolor(roc$col, alpha=0.5))
abline(v=roc$threshold[idx_threshold], lty=2, lwd=2, col="grey")
}
confusionMatrix(predictions2,idx_threshold)
cat(paste("TPR: ",tpr(predictions2, idx_threshold),"\n"))
cat(paste("FPR: ",fpr(predictions2, idx_threshold),"\n"))
cat(paste("Cost: ",cost(predictions2, idx_threshold, 1, 2),"\n"))
}
matcv <- data.frame(threshold = NA, tpr = NA, fpr = NA, cost = NA, Feature = NA)
for (i in 1:10){
splitting_and_testing(df)
matcv[i,] <- cbind(idx_threshold, itpr1, ifpr1, icost1, FeatureSec1)
print(FeatureSec)
i = i+1
}

## TPR: 0.729411764705882
## FPR: 0.17741935483871
## Cost: 68
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR: 0.815217391304348
## FPR: 0.290598290598291
## Cost: 68
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR: 0.644736842105263
## FPR: 0.24812030075188
## Cost: 87
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR: 0.865853658536585
## FPR: 0.173228346456693
## Cost: 44
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR: 0.722222222222222
## FPR: 0.218487394957983
## Cost: 76
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR: 0.893617021276596
## FPR: 0.234782608695652
## Cost: 47
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR: 0.69620253164557
## FPR: 0.161538461538462
## Cost: 69
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR: 0.855555555555556
## FPR: 0.369747899159664
## Cost: 70
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR: 0.808988764044944
## FPR: 0.133333333333333
## Cost: 50
## [1] "sex pclass fare FsizeD Fsize parch"

## TPR: 0.771739130434783
## FPR: 0.128205128205128
## Cost: 57
## [1] "sex pclass fare FsizeD Fsize parch"
View(matcv)
train_control <- trainControl(method="cv", number=10)
grid <- expand.grid(.fL=c(0), .usekernel=c(FALSE))
model <- train(survived~pclass+sex+age+sibsp+fare+embarked+parch, data=df, family = "binomial", trControl = trainControl(method = "cv", number = 10, verboseIter = TRUE))
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## + Fold01: mtry= 2
## - Fold01: mtry= 2
## + Fold01: mtry= 6
## - Fold01: mtry= 6
## + Fold01: mtry=11
## - Fold01: mtry=11
## + Fold02: mtry= 2
## - Fold02: mtry= 2
## + Fold02: mtry= 6
## - Fold02: mtry= 6
## + Fold02: mtry=11
## - Fold02: mtry=11
## + Fold03: mtry= 2
## - Fold03: mtry= 2
## + Fold03: mtry= 6
## - Fold03: mtry= 6
## + Fold03: mtry=11
## - Fold03: mtry=11
## + Fold04: mtry= 2
## - Fold04: mtry= 2
## + Fold04: mtry= 6
## - Fold04: mtry= 6
## + Fold04: mtry=11
## - Fold04: mtry=11
## + Fold05: mtry= 2
## - Fold05: mtry= 2
## + Fold05: mtry= 6
## - Fold05: mtry= 6
## + Fold05: mtry=11
## - Fold05: mtry=11
## + Fold06: mtry= 2
## - Fold06: mtry= 2
## + Fold06: mtry= 6
## - Fold06: mtry= 6
## + Fold06: mtry=11
## - Fold06: mtry=11
## + Fold07: mtry= 2
## - Fold07: mtry= 2
## + Fold07: mtry= 6
## - Fold07: mtry= 6
## + Fold07: mtry=11
## - Fold07: mtry=11
## + Fold08: mtry= 2
## - Fold08: mtry= 2
## + Fold08: mtry= 6
## - Fold08: mtry= 6
## + Fold08: mtry=11
## - Fold08: mtry=11
## + Fold09: mtry= 2
## - Fold09: mtry= 2
## + Fold09: mtry= 6
## - Fold09: mtry= 6
## + Fold09: mtry=11
## - Fold09: mtry=11
## + Fold10: mtry= 2
## - Fold10: mtry= 2
## + Fold10: mtry= 6
## - Fold10: mtry= 6
## + Fold10: mtry=11
## - Fold10: mtry=11
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
print(model)
## Random Forest
##
## 1046 samples
## 7 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 941, 941, 941, 942, 942, 942, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7876648 0.5413655
## 6 0.7857692 0.5471661
## 11 0.7847985 0.5469944
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.