EVALUATE THE MODEL: we use AUC (area under the ROC curve) to evaluate the binary classification model on held-out validation data.
#Load the packages used below: randomForest for the model, ROCR for the AUC calculations, dplyr for shaping the final output
library(randomForest)
library(ROCR)
library(dplyr)
#Import the data
training.raw <- read.csv('AT2_credit_train_STUDENT.csv', header = TRUE)
predict.raw <- read.csv('AT2_credit_test_STUDENT.csv', header = TRUE)
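It's worth a quick structural look at the raw data before cleaning. This check is an illustrative addition, not part of the original pipeline.
#Inspect the raw training data
dim(training.raw)
str(training.raw)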
#Convert ID to character so the IDs are not treated as numbers
clean_data <- function(dataSet) {
  output <- dataSet
  output$ID <- as.character(output$ID)
  #Clean up SEX: the raw column contains stray animal labels alongside 1/2.
  #Coercing via factor gives those junk entries codes above 2, which we then recode to 0
  output$SEX <- as.integer(as.factor(output$SEX))
  output$SEX[output$SEX > 2] <- 0
  #Collapse the undocumented EDUCATION codes (0 and anything above 4) into category 4
  output$EDUCATION[output$EDUCATION > 4] <- 4
  output$EDUCATION[output$EDUCATION == 0] <- 4
  #Clean AGE: entries of 100 or over become NA...
  output$AGE <- ifelse(output$AGE >= 100, NA, output$AGE)
  #...and the NAs are then replaced with the rounded mean age, which comes out at 35
  output$AGE[is.na(output$AGE)] <- round(mean(output$AGE, na.rm = TRUE))
  #Convert the categorical columns to factors for modelling
  output$SEX <- as.factor(output$SEX)
  output$EDUCATION <- as.factor(output$EDUCATION)
  output$MARRIAGE <- as.factor(output$MARRIAGE)
  return(output)
}
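As a quick sanity check (an illustrative sketch, not part of the original pipeline), we can confirm the cleaning behaves as intended on the training data.
#Sanity-check the cleaned columns
checked <- clean_data(training.raw)
max(checked$AGE)          #should now be below 100
table(checked$SEX)        #only the levels 0, 1 and 2 should remain
table(checked$EDUCATION)  #only the levels 1 to 4 should remain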
Create some standard functions for measuring evaluation criteria
# We'll want to look at evaluation measures regularly, so create a function to calculate and return them
get_evaluation_measures <- function(name = NA, tn, fp, fn, tp) {
  accuracy <- (tp + tn) / (tp + tn + fp + fn)
  precision <- tp / (tp + fp)
  recall <- tp / (tp + fn)
  F1 <- 2 * ((precision * recall) / (precision + recall))
  output <- data.frame(name, accuracy, precision, recall, F1)
  return(output)
}
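A toy confusion matrix (counts invented purely for illustration) confirms the expected argument order of tn, fp, fn, tp.
#Illustrative counts only: 50 TN, 10 FP, 5 FN, 35 TP
get_evaluation_measures("toy example", tn = 50, fp = 10, fn = 5, tp = 35)
#accuracy = 85/100 = 0.85, precision = 35/45 = 0.778, recall = 35/40 = 0.875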
#Now to get AUC. We'll do it again further on in our analysis, so write a function
get_auc <- function(probabilities, targets) {
  probs <- as.vector(probabilities)
  pred <- prediction(probs, targets)
  perf_AUC <- performance(pred, "auc")
  AUC <- perf_AUC@y.values[[1]]
  return(AUC)
}
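A quick check with made-up scores shows the function behaves as expected; ROCR treats the second factor level ("Y" here) as the positive class, so perfectly ranked scores should give an AUC of 1.
#Illustrative only: the scores rank both "Y" cases above both "N" cases
get_auc(c(0.9, 0.8, 0.3, 0.2), factor(c("Y", "Y", "N", "N")))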
Clean our training data
training.clean <- clean_data(training.raw)
#Drop any rows that still contain missing values
training.clean <- training.clean[complete.cases(training.clean), ]
Get train and test sets
trainset.size <- floor(0.80 * nrow(training.clean))
#index of the target ("default") column
target.col <- grep("default", names(training.clean))
#set random seed for reproducibility
set.seed(42)
trainset.indices <- sample(seq_len(nrow(training.clean)), size = trainset.size)
#assign observations to training and testing sets
train.set <- training.clean[trainset.indices, ]
test.set <- training.clean[-trainset.indices, ]
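The target classes are imbalanced, which is why the next line balances the bootstrap samples. A quick look (an illustrative check) makes this concrete.
#Check the class balance of the target in the training split
prop.table(table(train.set$default))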
#Balance the bootstrap samples: draw an equal number of each class, capped at the minority ("Y") count
sampsize <- rep(sum(train.set$default == "Y"), 2)
#Build random forest model (drop the ID column, which is an identifier, not a predictor)
rf.model <- randomForest(default ~ .,
                         data = train.set[, -1],
                         importance = TRUE,
                         xtest = test.set[, -c(1, target.col)],
                         keep.forest = TRUE,
                         sampsize = sampsize,
                         ntree = 1000)
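Because importance = TRUE was set, we can inspect which predictors the forest leans on. This is an optional sketch rather than part of the original pipeline.
#Optional: examine variable importance from the fitted forest
head(importance(rf.model))
varImpPlot(rf.model, main = "Random Forest variable importance")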
#Class predictions for the held-out test split (predict() returns class labels by default)
rf.pred <- predict(rf.model, test.set[, -target.col])
#confusion matrix
rf.cm <- table(pred = rf.pred, true = test.set[, target.col])
rf.cm.df <- as.data.frame(rf.cm)
#as.data.frame() unrolls the 2x2 table column by column, so Freq holds TN, FP, FN, TP in that order
rf.eval <- get_evaluation_measures("Random Forest",
                                   rf.cm.df$Freq[1],
                                   rf.cm.df$Freq[2],
                                   rf.cm.df$Freq[3],
                                   rf.cm.df$Freq[4])
#Get the AUC and add it to our evaluation measures data frame.
#test$votes[,2] is the fraction of trees voting "Y" for each xtest row, which acts as a probability score
rf.eval$AUC <- get_auc(rf.model$test$votes[, 2], test.set[, target.col])
rf.eval
## name accuracy precision recall F1 AUC
## 1 Random Forest 0.8013417 0.5639932 0.6189139 0.5901786 0.8096614
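For a visual companion to the AUC figure, ROCR can also draw the ROC curve from the same test-set votes. This plot is an optional addition.
#Optional: plot the ROC curve underlying the AUC
rf.rocr <- prediction(as.vector(rf.model$test$votes[, 2]), test.set[, target.col])
plot(performance(rf.rocr, "tpr", "fpr"), main = "Random Forest ROC")
abline(0, 1, lty = 2)  #the diagonal corresponds to a random classifier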
#Load the validation set (the same scoring file we already read in as predict.raw)
testset <- read.csv("AT2_credit_test_STUDENT.csv")
#The train set for the final tree model is the full cleaned training data
final.trainset <- training.clean
#Test set
final.testset <- clean_data(testset)
#Again balance the bootstrap samples using the minority ("Y") class count
final.sampsize <- rep(sum(final.trainset$default == "Y"), 2)
#Run the model (the ID column is excluded from both the train and test sets)
validation.model <- randomForest(default ~ .,
                                 data = final.trainset[, -1],
                                 importance = TRUE,
                                 xtest = final.testset[, -1],
                                 sampsize = final.sampsize,
                                 ntree = 1000)
#predictions for test set
final.testset$default <- validation.model$test$predicted
final_output <- final.testset %>%
  select(ID, default)
#Recode the Y/N predictions to 1/0 for the output file
final_output$default <- ifelse(final_output$default == "Y", 1, 0)
write.csv(final_output, file="predictions/rf_results.csv", row.names=FALSE)
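As a final check (illustrative, not part of the original script), we can read the file back and confirm its shape and the predicted default rate.
#Sanity-check the written predictions
submitted <- read.csv("predictions/rf_results.csv")
nrow(submitted)                        #one row per test ID
prop.table(table(submitted$default))   #share of predicted defaults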