Link to the dataset: https://www.kaggle.com/datasets/itssuru/loan-data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.2
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
loan <- read.csv("loan_data.csv", stringsAsFactors = T)
head(loan)
table(loan$not.fully.paid)
##
## 0 1
## 8045 1533
The data is imbalanced; we will try modeling both on the imbalanced data and on balanced data obtained by downsampling.
anyNA(loan)
## [1] FALSE
sum(duplicated(loan))
## [1] 0
The data has no missing values (NA) and no duplicated rows.
Changing the data type of the target column to factor.
loan <- loan %>% mutate(not.fully.paid = as.factor(not.fully.paid))
Dividing the data into train and test sets
set.seed(100)
insample <- sample(nrow(loan), nrow(loan) * 0.8)
train_loan <- loan[insample, ]
test_loan <- loan[-insample, ]
prop.table(table(train_loan$not.fully.paid))
##
## 0 1
## 0.839598 0.160402
prop.table(table(test_loan$not.fully.paid))
##
## 0 1
## 0.8413361 0.1586639
table(train_loan$not.fully.paid)
##
## 0 1
## 6433 1229
table(test_loan$not.fully.paid)
##
## 0 1
## 1612 304
Model using all columns as predictors
risk_model_all <- glm(formula = not.fully.paid ~ ., data = train_loan, family = "binomial")
summary(risk_model_all)
##
## Call:
## glm(formula = not.fully.paid ~ ., family = "binomial", data = train_loan)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3842 -0.6237 -0.4977 -0.3581 2.5250
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.026e+00 1.451e+00 6.221 4.95e-10 ***
## credit.policy -2.913e-01 9.384e-02 -3.104 0.001907 **
## purposecredit_card -6.398e-01 1.235e-01 -5.183 2.19e-07 ***
## purposedebt_consolidation -3.755e-01 8.577e-02 -4.378 1.20e-05 ***
## purposeeducational -5.298e-03 1.744e-01 -0.030 0.975768
## purposehome_improvement -5.285e-02 1.436e-01 -0.368 0.712945
## purposemajor_purchase -3.555e-01 1.809e-01 -1.966 0.049354 *
## purposesmall_business 4.340e-01 1.306e-01 3.324 0.000889 ***
## int.rate 2.031e+00 1.931e+00 1.052 0.292879
## installment 1.184e-03 1.950e-04 6.073 1.26e-09 ***
## log.annual.inc -4.091e-01 6.736e-02 -6.073 1.26e-09 ***
## dti 1.857e-03 5.090e-03 0.365 0.715186
## fico -9.690e-03 1.596e-03 -6.072 1.27e-09 ***
## days.with.cr.line 1.081e-05 1.514e-05 0.714 0.475483
## revol.bal 3.322e-06 1.051e-06 3.160 0.001577 **
## revol.util 2.459e-03 1.425e-03 1.726 0.084390 .
## inq.last.6mths 8.049e-02 1.534e-02 5.248 1.54e-07 ***
## delinq.2yrs -1.148e-01 6.190e-02 -1.855 0.063644 .
## pub.rec 2.179e-01 1.110e-01 1.962 0.049732 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6747.7 on 7661 degrees of freedom
## Residual deviance: 6285.9 on 7643 degrees of freedom
## AIC: 6323.9
##
## Number of Fisher Scoring iterations: 5
Performing backward stepwise selection to keep only the most useful predictors (based on AIC)
risk_model_backward <- step(object=risk_model_all,
direction="backward",
trace = F)
summary(risk_model_backward)
##
## Call:
## glm(formula = not.fully.paid ~ credit.policy + purpose + installment +
## log.annual.inc + fico + revol.bal + revol.util + inq.last.6mths +
## delinq.2yrs + pub.rec, family = "binomial", data = train_loan)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4227 -0.6236 -0.4990 -0.3581 2.5455
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.779e+00 1.075e+00 9.098 < 2e-16 ***
## credit.policy -2.918e-01 9.372e-02 -3.114 0.001847 **
## purposecredit_card -6.406e-01 1.228e-01 -5.218 1.81e-07 ***
## purposedebt_consolidation -3.724e-01 8.502e-02 -4.380 1.19e-05 ***
## purposeeducational -3.180e-03 1.744e-01 -0.018 0.985453
## purposehome_improvement -4.548e-02 1.435e-01 -0.317 0.751294
## purposemajor_purchase -3.539e-01 1.808e-01 -1.957 0.050333 .
## purposesmall_business 4.657e-01 1.269e-01 3.670 0.000242 ***
## installment 1.267e-03 1.790e-04 7.075 1.49e-12 ***
## log.annual.inc -4.016e-01 6.540e-02 -6.141 8.21e-10 ***
## fico -1.049e-02 1.232e-03 -8.515 < 2e-16 ***
## revol.bal 3.488e-06 1.031e-06 3.384 0.000715 ***
## revol.util 2.861e-03 1.389e-03 2.059 0.039463 *
## inq.last.6mths 8.217e-02 1.534e-02 5.358 8.42e-08 ***
## delinq.2yrs -1.061e-01 6.048e-02 -1.755 0.079320 .
## pub.rec 2.272e-01 1.099e-01 2.067 0.038700 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6747.7 on 7661 degrees of freedom
## Residual deviance: 6287.8 on 7646 degrees of freedom
## AIC: 6319.8
##
## Number of Fisher Scoring iterations: 5
prediction_prop <- predict(risk_model_backward, newdata = test_loan, type = "response")
threshold <- 0.15
prediction_prop <- ifelse(prediction_prop > threshold, 1, 0)
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: lattice
confusionMatrix(data = as.factor(prediction_prop),
reference = as.factor(test_loan$not.fully.paid),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 967 109
## 1 645 195
##
## Accuracy : 0.6065
## 95% CI : (0.5842, 0.6284)
## No Information Rate : 0.8413
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1407
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6414
## Specificity : 0.5999
## Pos Pred Value : 0.2321
## Neg Pred Value : 0.8987
## Prevalence : 0.1587
## Detection Rate : 0.1018
## Detection Prevalence : 0.4384
## Balanced Accuracy : 0.6207
##
## 'Positive' Class : 1
##
For this model, the metric to optimize is either accuracy or sensitivity. In my view, it depends on the company’s strategy. If the company wants to be more careful with its money, it can increase sensitivity by lowering the threshold, which helps avoid lending to customers who will fail to pay the money back. The drawback of this strategy is that the false positive rate also increases: more customers will be predicted as unable to repay even though they would actually pay back in full. This means fewer loans will be granted than should be, which leads to less income for the company.
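To illustrate this trade-off, here is a small sketch of my own (not part of the original run): it sweeps a few threshold values and reports accuracy, sensitivity, and specificity on the test set. The names probs and th_metrics are introduced here for illustration only.
# Sketch only: sweep a few thresholds and compare accuracy vs. sensitivity.
probs <- predict(risk_model_backward, newdata = test_loan, type = "response")
th_metrics <- sapply(c(0.10, 0.15, 0.20, 0.30, 0.50), function(th) {
  pred <- factor(ifelse(probs > th, 1, 0), levels = c(0, 1))
  cm <- confusionMatrix(data = pred, reference = test_loan$not.fully.paid, positive = "1")
  c(threshold = th,
    accuracy = unname(cm$overall["Accuracy"]),
    sensitivity = unname(cm$byClass["Sensitivity"]),
    specificity = unname(cm$byClass["Specificity"]))
})
t(th_metrics)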
Let’s try to make another model that might have not only good sensitivity but also good accuracy.
head(loan)
length(unique(loan$pub.rec))
## [1] 6
Let’s select only the numeric columns because KNN cannot work with categorical data.
loan_knn <- loan %>%
select(-credit.policy, -purpose)
set.seed(100)
insample <- sample(nrow(loan_knn), nrow(loan_knn) * 0.8)
train_loank <- loan_knn[insample, ]
test_loank <- loan_knn[-insample, ]
head(train_loank)
Because the columns are not on the same scale, we need to normalize the data.
normalize <- function(x){
return (
(x - min(x))/(max(x) - min(x))
)
}
train_normal <- data.frame(lapply(train_loank[, -12], normalize))
test_normal <- data.frame(lapply(test_loank[, -12], normalize))
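A side note (my own suggestion, not part of the original workflow): it is common to rescale the test set with the minimum and maximum taken from the training set, so the test data does not influence the scaling. A minimal sketch, where normalize_with and test_normal_alt are hypothetical names:
# Sketch only: scale test columns with the training set's min/max.
normalize_with <- function(x, ref) {
  (x - min(ref)) / (max(ref) - min(ref))
}
test_normal_alt <- data.frame(
  mapply(normalize_with, test_loank[, -12], train_loank[, -12], SIMPLIFY = FALSE)
)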
head(train_normal)
head(test_normal)
One common method to choose the value of K is to set it to the square root of the number of training rows.
k <- sqrt(nrow(train_normal))
# 87.5
k <- 87
library(class)
pred_knn <- knn(train = train_normal,
test = test_normal,
cl = train_loank$not.fully.paid,
k = 40)
confusionMatrix(data = pred_knn,
reference = as.factor(test_loank$not.fully.paid),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1611 304
## 1 1 0
##
## Accuracy : 0.8408
## 95% CI : (0.8237, 0.8569)
## No Information Rate : 0.8413
## P-Value [Acc > NIR] : 0.5402
##
## Kappa : -0.001
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000000
## Specificity : 0.9993797
## Pos Pred Value : 0.0000000
## Neg Pred Value : 0.8412533
## Prevalence : 0.1586639
## Detection Rate : 0.0000000
## Detection Prevalence : 0.0005219
## Balanced Accuracy : 0.4996898
##
## 'Positive' Class : 1
##
The KNN model has good accuracy but very poor sensitivity, and adjusting the K value did not improve it (see the sketch below). So logistic regression is the better model here, but we know it is still not very good at prediction. There is one more way to try to improve the model: balancing the data between the positive and negative classes.
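To back up the point about K, here is a sketch of my own (not output from the original run) that loops over a few K values and compares sensitivity; k_try, pred_k, and cm are names introduced here for illustration.
# Sketch only: compare sensitivity for several K values.
for (k_try in c(5, 21, 41, 87)) {
  pred_k <- knn(train = train_normal,
                test = test_normal,
                cl = train_loank$not.fully.paid,
                k = k_try)
  cm <- confusionMatrix(data = pred_k,
                        reference = as.factor(test_loank$not.fully.paid),
                        positive = "1")
  cat("k =", k_try, "| sensitivity =", round(cm$byClass[["Sensitivity"]], 4), "\n")
}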
But before balancing, let’s measure the logistic regression model’s performance using other methods (ROC & AUC).
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.2.2
pred_prob <- predict(risk_model_backward, newdata = test_loan, type = "response")
pred_prob <- data.frame(pred_prob)
roc_pred <- prediction(predictions = pred_prob[,1], labels = test_loan$not.fully.paid)
plot(performance(prediction.obj = roc_pred, measure = "tpr", x.measure = "fpr"))
abline(0, 1, lty = 2)
loan_auc <- performance(prediction.obj = roc_pred, measure = "auc")
loan_auc@y.values
## [[1]]
## [1] 0.6745176
The area under the curve (AUC) from the model predictions is 0.67. This confirms that the model is still not good enough.
I choose to do downsampling because the minority class has more than 1,200 observations, which should be enough.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(100)
library(caret)
loan_train_down <- downSample(x = train_loan %>% select(-not.fully.paid),
y = train_loan$not.fully.paid,
list = F,
yname = "not.fully.paid") #nama kolom target
head(loan_train_down)
prop.table(table(loan_train_down$not.fully.paid))
##
## 0 1
## 0.5 0.5
risk_model_all_new <- glm(formula = not.fully.paid ~ ., data = loan_train_down, family = "binomial")
summary(risk_model_all_new)
##
## Call:
## glm(formula = not.fully.paid ~ ., family = "binomial", data = loan_train_down)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.658 -1.080 -0.167 1.101 1.873
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.021e+01 2.007e+00 5.086 3.66e-07 ***
## credit.policy -2.809e-01 1.342e-01 -2.093 0.036350 *
## purposecredit_card -6.017e-01 1.565e-01 -3.844 0.000121 ***
## purposedebt_consolidation -3.470e-01 1.145e-01 -3.032 0.002429 **
## purposeeducational -7.075e-02 2.376e-01 -0.298 0.765932
## purposehome_improvement 8.664e-02 1.956e-01 0.443 0.657796
## purposemajor_purchase -4.077e-01 2.308e-01 -1.767 0.077278 .
## purposesmall_business 4.416e-01 1.855e-01 2.381 0.017287 *
## int.rate 3.829e+00 2.672e+00 1.433 0.151911
## installment 1.216e-03 2.644e-04 4.600 4.23e-06 ***
## log.annual.inc -4.802e-01 8.922e-02 -5.382 7.36e-08 ***
## dti -1.136e-03 6.905e-03 -0.164 0.869359
## fico -8.336e-03 2.165e-03 -3.849 0.000118 ***
## days.with.cr.line -2.715e-06 2.071e-05 -0.131 0.895687
## revol.bal 4.089e-06 1.653e-06 2.473 0.013381 *
## revol.util 3.303e-03 1.897e-03 1.741 0.081669 .
## inq.last.6mths 1.051e-01 2.461e-02 4.272 1.93e-05 ***
## delinq.2yrs -3.627e-02 8.800e-02 -0.412 0.680181
## pub.rec 3.918e-01 1.645e-01 2.381 0.017253 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3407.5 on 2457 degrees of freedom
## Residual deviance: 3120.9 on 2439 degrees of freedom
## AIC: 3158.9
##
## Number of Fisher Scoring iterations: 4
risk_model_backward_new <- step(object=risk_model_all_new,
direction="backward",
trace = F)
summary(risk_model_backward_new)
##
## Call:
## glm(formula = not.fully.paid ~ credit.policy + purpose + int.rate +
## installment + log.annual.inc + fico + revol.bal + revol.util +
## inq.last.6mths + pub.rec, family = "binomial", data = loan_train_down)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6528 -1.0783 -0.1676 1.1008 1.8747
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.013e+01 1.904e+00 5.321 1.03e-07 ***
## credit.policy -2.818e-01 1.339e-01 -2.104 0.035358 *
## purposecredit_card -6.056e-01 1.560e-01 -3.883 0.000103 ***
## purposedebt_consolidation -3.488e-01 1.135e-01 -3.072 0.002123 **
## purposeeducational -7.218e-02 2.374e-01 -0.304 0.761122
## purposehome_improvement 8.765e-02 1.953e-01 0.449 0.653510
## purposemajor_purchase -4.084e-01 2.306e-01 -1.771 0.076481 .
## purposesmall_business 4.424e-01 1.854e-01 2.386 0.017020 *
## int.rate 3.776e+00 2.669e+00 1.415 0.157083
## installment 1.215e-03 2.641e-04 4.600 4.22e-06 ***
## log.annual.inc -4.826e-01 8.626e-02 -5.595 2.21e-08 ***
## fico -8.222e-03 2.062e-03 -3.987 6.68e-05 ***
## revol.bal 4.013e-06 1.595e-06 2.516 0.011857 *
## revol.util 3.358e-03 1.826e-03 1.839 0.065879 .
## inq.last.6mths 1.053e-01 2.448e-02 4.302 1.69e-05 ***
## pub.rec 3.933e-01 1.631e-01 2.412 0.015875 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3407.5 on 2457 degrees of freedom
## Residual deviance: 3121.2 on 2442 degrees of freedom
## AIC: 3153.2
##
## Number of Fisher Scoring iterations: 4
prediction_prop_new <- predict(risk_model_backward_new, newdata = test_loan, type = "response")
threshold <- 0.3
prediction_prop_new <- ifelse(prediction_prop_new > threshold, 1, 0)
library(caret)
confusionMatrix(data = as.factor(prediction_prop_new),
reference = as.factor(test_loan$not.fully.paid),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 313 23
## 1 1299 281
##
## Accuracy : 0.31
## 95% CI : (0.2894, 0.3313)
## No Information Rate : 0.8413
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0438
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9243
## Specificity : 0.1942
## Pos Pred Value : 0.1778
## Neg Pred Value : 0.9315
## Prevalence : 0.1587
## Detection Rate : 0.1467
## Detection Prevalence : 0.8246
## Balanced Accuracy : 0.5593
##
## 'Positive' Class : 1
##
pred_prob_new <- predict(risk_model_backward_new, newdata = test_loan, type = "response")
pred_prob_new <- data.frame(pred_prob_new)
roc_pred_new <- prediction(predictions = pred_prob_new[,1], labels = test_loan$not.fully.paid)
plot(performance(prediction.obj = roc_pred_new, measure = "tpr", x.measure = "fpr"))
abline(0, 1, lty = 2)
loan_auc_new <- performance(prediction.obj = roc_pred_new, measure = "auc")
loan_auc_new@y.values
## [[1]]
## [1] 0.675795
The AUC after downsampling the data is 0.68, which is not any better than the previous model. Let’s try upsampling the data.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(100)
library(caret)
loan_train_up <- upSample(x = train_loan %>% select(-not.fully.paid),
y = train_loan$not.fully.paid,
list = F,
yname = "not.fully.paid") #nama kolom target
head(loan_train_up)
table(loan_train_up$not.fully.paid)
##
## 0 1
## 6433 6433
risk_model_all_new1 <- glm(formula = not.fully.paid ~ ., data = loan_train_up, family = "binomial")
summary(risk_model_all_new1)
##
## Call:
## glm(formula = not.fully.paid ~ ., family = "binomial", data = loan_train_up)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5768 -1.0815 -0.1357 1.0942 1.8654
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.057e+01 8.601e-01 12.284 < 2e-16 ***
## credit.policy -1.769e-01 5.834e-02 -3.033 0.002424 **
## purposecredit_card -6.288e-01 6.809e-02 -9.236 < 2e-16 ***
## purposedebt_consolidation -3.914e-01 4.980e-02 -7.860 3.85e-15 ***
## purposeeducational -8.997e-02 1.053e-01 -0.854 0.392906
## purposehome_improvement -5.522e-02 8.368e-02 -0.660 0.509364
## purposemajor_purchase -3.966e-01 1.024e-01 -3.872 0.000108 ***
## purposesmall_business 3.666e-01 8.150e-02 4.498 6.86e-06 ***
## int.rate 3.327e+00 1.166e+00 2.854 0.004317 **
## installment 1.236e-03 1.145e-04 10.801 < 2e-16 ***
## log.annual.inc -4.664e-01 3.888e-02 -11.996 < 2e-16 ***
## dti -2.469e-03 2.980e-03 -0.829 0.407283
## fico -9.064e-03 9.307e-04 -9.740 < 2e-16 ***
## days.with.cr.line 2.980e-06 9.039e-06 0.330 0.741641
## revol.bal 5.489e-06 7.469e-07 7.349 1.99e-13 ***
## revol.util 3.140e-03 8.265e-04 3.799 0.000145 ***
## inq.last.6mths 1.095e-01 1.067e-02 10.262 < 2e-16 ***
## delinq.2yrs -6.624e-02 3.527e-02 -1.878 0.060349 .
## pub.rec 2.885e-01 7.078e-02 4.075 4.59e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17836 on 12865 degrees of freedom
## Residual deviance: 16380 on 12847 degrees of freedom
## AIC: 16418
##
## Number of Fisher Scoring iterations: 4
risk_model_backward_new1 <- step(object=risk_model_all_new1,
direction="backward",
trace = F)
summary(risk_model_backward_new1)
##
## Call:
## glm(formula = not.fully.paid ~ credit.policy + purpose + int.rate +
## installment + log.annual.inc + fico + revol.bal + revol.util +
## inq.last.6mths + delinq.2yrs + pub.rec, family = "binomial",
## data = loan_train_up)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5576 -1.0826 -0.1351 1.0976 1.8577
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.042e+01 8.247e-01 12.640 < 2e-16 ***
## credit.policy -1.761e-01 5.816e-02 -3.028 0.002461 **
## purposecredit_card -6.332e-01 6.772e-02 -9.351 < 2e-16 ***
## purposedebt_consolidation -3.959e-01 4.937e-02 -8.020 1.06e-15 ***
## purposeeducational -9.087e-02 1.053e-01 -0.863 0.388071
## purposehome_improvement -5.349e-02 8.359e-02 -0.640 0.522270
## purposemajor_purchase -3.961e-01 1.024e-01 -3.868 0.000110 ***
## purposesmall_business 3.684e-01 8.146e-02 4.522 6.11e-06 ***
## int.rate 3.290e+00 1.164e+00 2.827 0.004695 **
## installment 1.236e-03 1.144e-04 10.804 < 2e-16 ***
## log.annual.inc -4.600e-01 3.772e-02 -12.193 < 2e-16 ***
## fico -8.973e-03 9.024e-04 -9.943 < 2e-16 ***
## revol.bal 5.414e-06 7.230e-07 7.489 6.96e-14 ***
## revol.util 3.060e-03 8.113e-04 3.772 0.000162 ***
## inq.last.6mths 1.094e-01 1.063e-02 10.286 < 2e-16 ***
## delinq.2yrs -6.325e-02 3.468e-02 -1.824 0.068198 .
## pub.rec 2.913e-01 7.031e-02 4.143 3.43e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17836 on 12865 degrees of freedom
## Residual deviance: 16380 on 12849 degrees of freedom
## AIC: 16414
##
## Number of Fisher Scoring iterations: 4
prediction_prop_new1 <- predict(risk_model_backward_new1, newdata = test_loan, type = "response")
threshold <- 0.4
prediction_prop_new1 <- ifelse(prediction_prop_new1 > threshold, 1, 0)
library(caret)
confusionMatrix(data = as.factor(prediction_prop_new1),
reference = as.factor(test_loan$not.fully.paid),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 657 63
## 1 955 241
##
## Accuracy : 0.4687
## 95% CI : (0.4461, 0.4913)
## No Information Rate : 0.8413
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0915
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7928
## Specificity : 0.4076
## Pos Pred Value : 0.2015
## Neg Pred Value : 0.9125
## Prevalence : 0.1587
## Detection Rate : 0.1258
## Detection Prevalence : 0.6242
## Balanced Accuracy : 0.6002
##
## 'Positive' Class : 1
##
pred_prob_new1 <- predict(risk_model_backward_new1, newdata = test_loan, type = "response")
pred_prob_new1 <- data.frame(pred_prob_new1)
roc_pred_new1 <- prediction(predictions = pred_prob_new1[,1], labels = test_loan$not.fully.paid)
plot(performance(prediction.obj = roc_pred_new1, measure = "tpr", x.measure = "fpr"))
abline(0, 1, lty = 2)
loan_auc_new1 <- performance(prediction.obj = roc_pred_new1, measure = "auc")
loan_auc_new1@y.values
## [[1]]
## [1] 0.6731157
The AUC we got is 0.67, which is almost the same as the previous models. I think we need a better algorithm to tackle this problem, such as naive Bayes, a decision tree, or a random forest.
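As a possible next step, here is a sketch (not something run in this analysis) of a random forest fitted on the downsampled training data and evaluated on the same test set. It assumes the randomForest package is installed, and risk_model_rf and pred_rf are hypothetical names.
# Sketch only: a random forest as a candidate next model.
library(randomForest)
set.seed(100)
risk_model_rf <- randomForest(not.fully.paid ~ ., data = loan_train_down, ntree = 500)
pred_rf <- predict(risk_model_rf, newdata = test_loan)
confusionMatrix(data = pred_rf,
                reference = test_loan$not.fully.paid,
                positive = "1")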