Start modelling

EVALUATE THE MODEL: since this is a binary classification problem, models are evaluated by AUC (area under the ROC curve) on the validation data.

#Load the libraries used throughout: randomForest for the models,
#ROCR for the AUC/ROC calculations and dplyr for data manipulation
library(randomForest)
library(ROCR)
library(dplyr)

#Import the data
training.raw <- read.csv('AT2_credit_train_STUDENT.csv', header = TRUE)
predict.raw <- read.csv('AT2_credit_test_STUDENT.csv', header = TRUE)

Clean the data based on observations

#change ID to "character" so the IDs are treated as labels rather than numbers

clean_data <- function(dataSet) {
  
  #work on a copy of the input data
  output <- dataSet
  
  output$ID <- as.character(output$ID)
  
  #SEX contains text (animal) entries, so convert via factor level codes
  output$SEX <- as.integer(as.factor(output$SEX))
  
  #codes above 2 are the junk/animal entries; recode them to 0 ("unknown")
  output$SEX[output$SEX > 2] <- 0
  
  #collapse unknown education codes (0 and anything above 4) into category 4
  output$EDUCATION[output$EDUCATION > 4] <- 4
  output$EDUCATION[output$EDUCATION == 0] <- 4
  
  #ages of 100 or more look like data entry errors, so set them to NA
  output$AGE <- ifelse(output$AGE >= 100, NA, output$AGE)
  #then replace the NAs with the rounded mean of the remaining ages
  output$AGE[is.na(output$AGE)] <- round(mean(output$AGE, na.rm = TRUE))
  #convert the coded columns to factors so the models treat them as categories
  output$SEX <- as.factor(output$SEX)
  output$EDUCATION <- as.factor(output$EDUCATION)
  output$MARRIAGE <- as.factor(output$MARRIAGE)
  
  #the target only exists in the training file; make it a factor for classification
  if ("default" %in% names(output)) output$default <- as.factor(output$default)

  return(output)
  
}
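
As a quick sanity check (optional, just to confirm the conversions behave as intended), compare the cleaned columns against the expectations above:

#SEX/EDUCATION/MARRIAGE should come back as factors, and AGE should have no values of 100+
str(clean_data(training.raw)[, c("SEX", "EDUCATION", "MARRIAGE", "AGE")])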

Create some standard functions for measuring evaluation criteria

# We'll want to look at evaluation measures regularly, so create a function to calculate and return them
get_evaluation_measures <- function(name = NA, tn, fp, fn, tp) {
  
  accuracy = (tp+tn)/(tp+tn+fp+fn)
  
  precision = tp/(tp+fp)
  
  recall = tp/(tp+fn)
  
  F1 = 2 * ((precision * recall)/(precision + recall))
  
  output = data.frame(name, accuracy, precision, recall, F1)
  
  return(output)
  
}
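
For example, feeding in a made-up confusion matrix (50 true negatives, 10 false positives, 5 false negatives, 35 true positives; the numbers are purely illustrative):

get_evaluation_measures("toy example", tn = 50, fp = 10, fn = 5, tp = 35)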
#We'll need the AUC again later in the analysis, so wrap the calculation in a function
get_auc <- function(probabilities, targets) {
  
  probs = as.vector(probabilities)
  
  pred = prediction(probs,targets)
  
  perf_AUC = performance(pred, "auc")
  
  AUC = perf_AUC@y.values[[1]]
  
  return(AUC)
  
}
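
A quick check with made-up scores (ROCR must be loaded, as above): a model that ranks every positive above every negative should return an AUC of exactly 1.

get_auc(c(0.9, 0.8, 0.3, 0.1), c(1, 1, 0, 0))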

Clean our training data

training.clean <- clean_data(training.raw)

#drop any rows that still contain missing values
training.clean <- training.clean[complete.cases(training.clean), ]

Get train and test sets

trainset.size <- floor(0.80 * nrow(training.clean))

#column index of the target variable
target.col <- grep("default", names(training.clean))

#set a random seed for reproducibility
set.seed(42)

trainset.indices <- sample(seq_len(nrow(training.clean)), size = trainset.size)

#assign observations to training and testing sets
train.set <- training.clean[trainset.indices, ]
test.set <- training.clean[-trainset.indices, ]

#balanced sampling: each tree draws as many non-defaults as there are defaults (Y)
sampsize = rep(sum(train.set$default == "Y"), 2)
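
Defaults are the minority class, so without this each tree would mostly see non-defaults; drawing equal-sized per-class samples for every tree is one common way to handle the imbalance. The raw counts show the skew:

table(train.set$default)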

Try a random forest model

#Build random forest model (drop ID, a high-cardinality identifier, not a predictor)
rf.model <- randomForest(default ~ .,
                         data = train.set[,-1],
                         importance = TRUE,
                         xtest = test.set[,-c(1, target.col)],
                         keep.forest = TRUE,
                         sampsize = sampsize,
                         ntree = 1000)
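
Since importance = TRUE was set, it is worth a quick look at which predictors the forest leans on (varImpPlot comes with the randomForest package):

#variable importance: mean decrease in accuracy and in Gini impurity
varImpPlot(rf.model, main = "Random Forest variable importance")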

#predicted classes for the held-out test set
#(equivalently available as rf.model$test$predicted, since xtest was supplied)
rf.class <- predict(rf.model, test.set[,-c(1, target.col)])

#confusion matrix; with levels N/Y, as.data.frame unrolls it as TN, FP, FN, TP
rf.cm <- table(pred = rf.class, true = test.set[,target.col])
rf.cm.df <- as.data.frame(rf.cm)

rf.eval <- get_evaluation_measures("Random Forest",
                                   tn = rf.cm.df$Freq[1],
                                   fp = rf.cm.df$Freq[2],
                                   fn = rf.cm.df$Freq[3],
                                   tp = rf.cm.df$Freq[4])

#Get the AUC and add it to our evaluation measures data frame
rf.eval$AUC <- get_auc(rf.model$test$votes[,2], test.set[,target.col])

rf.eval
##            name  accuracy precision    recall        F1       AUC
## 1 Random Forest 0.8013417 0.5639932 0.6189139 0.5901786 0.8096614
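
To see the trade-off behind that AUC, we can plot the ROC curve from the same test-set vote fractions (a sketch using ROCR; rf.rocr and rf.roc are new names introduced here):

#ROC curve: true positive rate against false positive rate
rf.rocr <- prediction(as.vector(rf.model$test$votes[,2]), test.set[,target.col])
rf.roc <- performance(rf.rocr, "tpr", "fpr")
plot(rf.roc, main = "Random Forest ROC curve")
abline(a = 0, b = 1, lty = 2)  #the diagonal corresponds to random guessing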

Now use the Random Forest model to predict for the validation set

#The validation set is the file we already read in as predict.raw
testset <- predict.raw

#For the final model, train on the full cleaned training set
final.trainset <- training.clean

#Test set
final.testset <- clean_data(testset)

final.sampsize = rep(sum(final.trainset$default == "Y"), 2)

#Run the model (drop the ID column from both the train and test sets)
validation.model <- randomForest(default~.,
                                 data = final.trainset[,-1], 
                                 importance = TRUE, 
                                 xtest = final.testset[,-1],
                                 sampsize = final.sampsize,
                                 ntree = 1000)

#predictions for test set
final.testset$default <- validation.model$test$predicted

final_output <- final.testset %>%
    select(ID, default)

final_output$default <- ifelse(final_output$default == "Y", 1, 0)

#create the output folder if needed, then write the submission file
if (!dir.exists("predictions")) dir.create("predictions")
write.csv(final_output, file = "predictions/rf_results.csv", row.names = FALSE)
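
A final glance at the first few rows confirms the submission format (an ID column plus a 0/1 default column):

head(final_output)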