DATA 607, Spring 2024
Chhiring Lama
2024-04-03
# Set the knitr chunk option `echo` to FALSE as the document-wide default.
knitr::opts_chunk$set(echo = FALSE)
## ## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats': ## ## filter, lag
## The following objects are masked from 'package:base': ## ## intersect, setdiff, setequal, union
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## ## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2': ## ## margin
## The following object is masked from 'package:dplyr': ## ## combine
## Warning: package 'rpart' was built under R version 4.3.3
## Warning: package 'rpart.plot' was built under R version 4.3.3
DATA 607, Spring 2024
Chhiring Lama
# Read the loan-eligibility CSV from its hosted location into a data frame.
loan_url <- "https://media.githubusercontent.com/media/topkelama/lfsStorage/main/loan_eligibility.csv"
loana_df <- read.csv(loan_url)

# Set a seed for reproducibility, then put the rows in random order so that
# any later split by position is not biased by the file's original ordering.
set.seed(1234)
shuffled_loan <- loana_df[sample(nrow(loana_df)), ]
# Build the modelling table in a single pipeline:
#   1. drop the two identifier columns,
#   2. encode the categorical predictors as factors,
#   3. discard every row that still contains a missing value,
#   4. label each loan "Approved" when its current amount is below 500000
#      and "Denied" otherwise.
clean_loan <- shuffled_loan %>%
  select(-Loan.ID, -Customer.ID) %>%
  mutate(
    across(
      c(
        Term, Years.in.current.job, Home.Ownership,
        Purpose, Bankruptcies, Tax.Liens
      ),
      factor
    )
  ) %>%
  na.omit() %>%
  mutate(
    Loan_Status = ifelse(Current.Loan.Amount < 500000, "Approved", "Denied")
  )
#' Split a data set into a training or a test partition
#'
#' The split is made by row position: the first `round(size * nrow(data))`
#' rows are the training partition and all remaining rows are the test
#' partition. Because the split is deterministic, calling the function once
#' with `train = TRUE` and once with `train = FALSE` on the same data yields
#' two partitions that never overlap and together cover every row. Drawing a
#' fresh random sample on each call does not give that guarantee and lets
#' training rows leak into the test set.
#'
#' Shuffle the rows of `data` before calling this function if the original
#' row order is not already random.
#'
#' @param data A data frame (or matrix) to split by rows.
#' @param size Proportion of rows assigned to the training partition; a single
#'   number between 0 and 1.
#' @param train If `TRUE`, return the training partition; otherwise return the
#'   test partition.
#' @return The selected rows of `data`, with the same columns as `data`.
create_train_test <- function(data, size = 0.8, train = TRUE) {
  stopifnot(
    is.numeric(size), length(size) == 1L, !is.na(size),
    size >= 0, size <= 1
  )

  n_row <- nrow(data)
  train_rows <- seq_len(round(size * n_row))

  if (isTRUE(as.logical(train))) {
    selected_rows <- train_rows
  } else {
    # Index the complement directly; passing a single leftover index to
    # sample() would draw from 1:index instead of returning that index.
    selected_rows <- setdiff(seq_len(n_row), train_rows)
  }

  # drop = FALSE keeps the result two-dimensional even for one-column input.
  data[selected_rows, , drop = FALSE]
}
# Training partition: 80% of the cleaned rows.
train_set <- create_train_test(clean_loan, size = 0.8, train = TRUE)
dim(train_set)
## [1] 598 17
# Held-out test partition: 20% of the cleaned rows.
test_set <- create_train_test(clean_loan, size = 0.8, train = FALSE)
dim(test_set)
## [1] 150 17
# Model specification: predict the loan status from the credit score, the
# length of the current employment, and the home-ownership category.
formula <- Loan_Status ~
  Credit.Score +
  Years.in.current.job +
  Home.Ownership

# Fit a classification tree on the training partition.
decision_tree <- rpart(
  formula = formula,
  data = train_set,
  method = "class"
)

# Show the complexity-parameter table of the fitted tree.
printcp(decision_tree)
## ## Classification tree: ## rpart(formula = formula, data = train_set, method = "class") ## ## Variables actually used in tree construction: ## [1] Credit.Score Years.in.current.job ## ## Root node error: 167/598 = 0.27926 ## ## n= 598 ## ## CP nsplit rel error xerror xstd ## 1 0.013972 0 1.00000 1.0000 0.065695 ## 2 0.010000 3 0.95808 1.0539 0.066734
# Plot the decision tree
rpart.plot(decision_tree, uniform = TRUE, main = "Decision Tree Model")
Next, calculate the model's accuracy on the test set.
# Predict a class label for every row of the test partition.
predictions <- predict(
  object = decision_tree,
  newdata = test_set,
  type = "class"
)

# Accuracy is the share of test rows whose predicted label equals the
# recorded loan status.
accuracy <- mean(test_set$Loan_Status == predictions)

# Report the accuracy as a percentage rounded to two decimal places.
print(
  paste(
    "Accuracy of the decision tree model:",
    round(100 * accuracy, digits = 2),
    "%"
  )
)
## [1] "Accuracy of the decision tree model: 74.67 %"
Personal loan modeling. (2020, March 10). Kaggle. https://www.kaggle.com/datasets/teertha/personal-loan-modeling
Memberdev_Admin. (2023, March 31). Om Swami Official Writings & Courses. os.me - a Spiritual Home. https://os.me/short-stories/the-upside-down-tree/