library(tidyverse)
library(caTools)
library(ROCR)
library(rpart)
library(rmdformats)
library(randomForest)
library(psych)## load data
# Read the loan application records directly from the raw CSV hosted on GitHub.
my_loan_data <- read.csv(
  file = "https://raw.githubusercontent.com/yinaS1234/data-606/main/606%20final%20project/loan_data.csv"
)
# Preview the first six applications.
head(x = my_loan_data, n = 6L)
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## 1 LP001002 Male No 0 Graduate No 5849
## 2 LP001003 Male Yes 1 Graduate No 4583
## 3 LP001005 Male Yes 0 Graduate Yes 3000
## 4 LP001006 Male Yes 0 Not Graduate No 2583
## 5 LP001008 Male No 0 Graduate No 6000
## 6 LP001011 Male Yes 2 Graduate Yes 5417
## CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
## 1 0 NA 360 1 Urban
## 2 1508 128 360 1 Rural
## 3 0 66 360 1 Urban
## 4 2358 120 360 1 Urban
## 5 0 141 360 1 Urban
## 6 4196 267 360 1 Urban
## Loan_Status
## 1 Y
## 2 N
## 3 Y
## 4 Y
## 5 Y
## 6 Y
# Report the raw dimensions (rows, columns) before any cleaning.
dim(x = my_loan_data)
## [1] 614 13
# Keep an untouched copy of the raw data before any rows are dropped.
my_loan_data_backup <- my_loan_data
# List every application that has an NA in at least one column.
subset(my_loan_data, subset = !complete.cases(my_loan_data))
## Loan_ID Gender Married Dependents Education Self_Employed
## 1 LP001002 Male No 0 Graduate No
## 17 LP001034 Male No 1 Not Graduate No
## 20 LP001041 Male Yes 0 Graduate
## 25 LP001052 Male Yes 1 Graduate
## 31 LP001091 Male Yes 1 Graduate
## 36 LP001106 Male Yes 0 Graduate No
## 37 LP001109 Male Yes 0 Graduate No
## 43 LP001123 Male Yes 0 Graduate No
## 45 LP001136 Male Yes 0 Not Graduate Yes
## 46 LP001137 Female No 0 Graduate No
## 64 LP001213 Male Yes 1 Graduate No
## 74 LP001250 Male Yes 3+ Not Graduate No
## 80 LP001264 Male Yes 3+ Not Graduate Yes
## 82 LP001266 Male Yes 1 Graduate Yes
## 84 LP001273 Male Yes 0 Graduate No
## 87 LP001280 Male Yes 2 Not Graduate No
## 96 LP001326 Male No 0 Graduate
## 103 LP001350 Male Yes Graduate No
## 104 LP001356 Male Yes 0 Graduate No
## 113 LP001391 Male Yes 0 Not Graduate No
## 114 LP001392 Female No 1 Graduate Yes
## 118 LP001405 Male Yes 1 Graduate No
## 126 LP001443 Female No 0 Graduate No
## 128 LP001449 Male No 0 Graduate No
## 130 LP001465 Male Yes 0 Graduate No
## 131 LP001469 Male No 0 Graduate Yes
## 157 LP001541 Male Yes 1 Graduate No
## 166 LP001574 Male Yes 0 Graduate No
## 182 LP001634 Male No 0 Graduate No
## 188 LP001643 Male Yes 0 Graduate No
## 198 LP001669 Female No 0 Not Graduate No
## 199 LP001671 Female Yes 0 Graduate No
## 203 LP001682 Male Yes 3+ Not Graduate No
## 220 LP001734 Female Yes 2 Graduate No
## 224 LP001749 Male Yes 0 Graduate No
## 233 LP001770 Male No 0 Not Graduate No
## 237 LP001786 Male Yes 0 Graduate
## 238 LP001788 Female No 0 Graduate Yes
## 260 LP001864 Male Yes 3+ Not Graduate No
## 261 LP001865 Male Yes 1 Graduate No
## 280 LP001908 Female Yes 0 Not Graduate No
## 285 LP001922 Male Yes 0 Graduate No
## 306 LP001990 Male No 0 Not Graduate No
## 310 LP001998 Male Yes 2 Not Graduate No
## 314 LP002008 Male Yes 2 Graduate Yes
## 318 LP002036 Male Yes 0 Graduate No
## 319 LP002043 Female No 1 Graduate No
## 323 LP002054 Male Yes 2 Not Graduate No
## 324 LP002055 Female No 0 Graduate No
## 336 LP002106 Male Yes Graduate Yes
## 339 LP002113 Female No 3+ Not Graduate No
## 349 LP002137 Male Yes 0 Graduate No
## 364 LP002178 Male Yes 0 Graduate No
## 368 LP002188 Male No 0 Graduate No
## 378 LP002223 Male Yes 0 Graduate No
## 388 LP002243 Male Yes 0 Not Graduate No
## 393 LP002263 Male Yes 0 Graduate No
## 396 LP002272 Male Yes 2 Graduate No
## 412 LP002319 Male Yes 0 Graduate
## 422 LP002357 Female No 0 Not Graduate No
## 424 LP002362 Male Yes 1 Graduate No
## 436 LP002393 Female Graduate No
## 438 LP002401 Male Yes 0 Graduate No
## 445 LP002424 Male Yes 0 Graduate No
## 450 LP002444 Male No 1 Not Graduate Yes
## 452 LP002447 Male Yes 2 Not Graduate No
## 461 LP002478 Yes 0 Graduate Yes
## 474 LP002522 Female No 0 Graduate Yes
## 480 LP002533 Male Yes 2 Graduate No
## 491 LP002560 Male No 0 Not Graduate No
## 492 LP002562 Male Yes 1 Not Graduate No
## 498 LP002588 Male Yes 0 Graduate No
## 504 LP002618 Male Yes 1 Not Graduate No
## 507 LP002624 Male Yes 0 Graduate No
## 525 LP002697 Male No 0 Graduate No
## 531 LP002717 Male Yes 0 Graduate No
## 534 LP002729 Male No 1 Graduate No
## 545 LP002757 Female Yes 0 Not Graduate No
## 551 LP002778 Male Yes 2 Graduate Yes
## 552 LP002784 Male Yes 1 Not Graduate No
## 557 LP002794 Female No 0 Graduate No
## 566 LP002833 Male Yes 0 Not Graduate No
## 584 LP002898 Male Yes 1 Graduate No
## 601 LP002949 Female No 3+ Graduate
## 606 LP002960 Male Yes 0 Not Graduate No
## ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
## 1 5849 0 NA 360
## 17 3596 0 100 240
## 20 2600 3500 115 NA
## 25 3717 2925 151 360
## 31 4166 3369 201 360
## 36 2275 2067 NA 360
## 37 1828 1330 100 NA
## 43 2400 0 75 360
## 45 4695 0 96 NA
## 46 3410 0 88 NA
## 64 4945 0 NA 360
## 74 4755 0 95 NA
## 80 3333 2166 130 360
## 82 2395 0 NA 360
## 84 6000 2250 265 360
## 87 3333 2000 99 360
## 96 6782 0 NA 360
## 103 13650 0 NA 360
## 104 4652 3583 NA 360
## 113 3572 4114 152 NA
## 114 7451 0 NA 360
## 118 2214 1398 85 360
## 126 3692 0 93 360
## 128 3865 1640 NA 360
## 130 6080 2569 182 360
## 131 20166 0 650 480
## 157 6000 0 160 360
## 166 3707 3166 182 NA
## 182 1916 5063 67 360
## 188 2383 2138 58 360
## 198 1907 2365 120 NA
## 199 3416 2816 113 360
## 203 3992 0 NA 180
## 220 4283 2383 127 360
## 224 7578 1010 175 NA
## 233 3189 2598 120 NA
## 237 5746 0 255 360
## 238 3463 0 122 360
## 260 4931 0 128 360
## 261 6083 4250 330 360
## 280 4100 0 124 360
## 285 20667 0 NA 360
## 306 2000 0 NA 360
## 310 7667 0 185 360
## 314 5746 0 144 84
## 318 2058 2134 88 360
## 319 3541 0 112 360
## 323 3601 1590 NA 360
## 324 3166 2985 132 360
## 336 5503 4490 70 NA
## 339 1830 0 NA 360
## 349 6333 4583 259 360
## 364 3013 3033 95 300
## 368 5124 0 124 NA
## 378 4310 0 130 360
## 388 3010 3136 NA 360
## 393 2583 2115 120 360
## 396 3276 484 135 360
## 412 6256 0 160 360
## 422 2720 0 80 NA
## 424 7250 1667 110 NA
## 436 10047 0 NA 240
## 438 2213 1125 NA 360
## 445 7333 8333 175 300
## 450 2769 1542 190 360
## 452 1958 1456 60 300
## 461 2083 4083 160 360
## 474 2500 0 93 360
## 480 2947 1603 NA 360
## 491 2699 2785 96 360
## 492 5333 1131 186 360
## 498 4625 2857 111 12
## 504 4050 5302 138 360
## 507 20833 6667 480 360
## 525 4680 2087 NA 360
## 531 1025 5500 216 360
## 534 11250 0 196 360
## 545 3017 663 102 360
## 551 6633 0 NA 360
## 552 2492 2375 NA 360
## 557 2667 1625 84 360
## 566 4467 0 120 360
## 584 1880 0 61 360
## 601 416 41667 350 180
## 606 2400 3800 NA 180
## Credit_History Property_Area Loan_Status
## 1 1 Urban Y
## 17 NA Urban Y
## 20 1 Urban Y
## 25 NA Semiurban N
## 31 NA Urban N
## 36 1 Urban Y
## 37 0 Urban N
## 43 NA Urban Y
## 45 1 Urban Y
## 46 1 Urban Y
## 64 0 Rural N
## 74 0 Semiurban N
## 80 NA Semiurban Y
## 82 1 Semiurban Y
## 84 NA Semiurban N
## 87 NA Semiurban Y
## 96 NA Urban N
## 103 1 Urban Y
## 104 1 Semiurban Y
## 113 0 Rural N
## 114 1 Semiurban Y
## 118 NA Urban Y
## 126 NA Rural Y
## 128 1 Rural Y
## 130 NA Rural N
## 131 NA Urban Y
## 157 NA Rural Y
## 166 1 Rural Y
## 182 NA Rural N
## 188 NA Rural Y
## 198 1 Urban Y
## 199 NA Semiurban Y
## 203 1 Urban N
## 220 NA Semiurban Y
## 224 1 Semiurban Y
## 233 1 Rural Y
## 237 NA Urban N
## 238 NA Urban Y
## 260 NA Semiurban N
## 261 NA Urban Y
## 280 NA Rural Y
## 285 1 Rural N
## 306 1 Urban N
## 310 NA Rural Y
## 314 NA Rural Y
## 318 NA Urban Y
## 319 NA Semiurban Y
## 323 1 Rural Y
## 324 NA Rural Y
## 336 1 Semiurban Y
## 339 0 Urban N
## 349 NA Semiurban Y
## 364 NA Urban Y
## 368 0 Rural N
## 378 NA Semiurban Y
## 388 0 Urban N
## 393 NA Urban Y
## 396 NA Semiurban Y
## 412 NA Urban Y
## 422 0 Urban N
## 424 0 Urban N
## 436 1 Semiurban Y
## 438 1 Urban Y
## 445 NA Rural Y
## 450 NA Semiurban N
## 452 NA Urban Y
## 461 NA Semiurban Y
## 474 NA Urban Y
## 480 1 Urban N
## 491 NA Semiurban Y
## 492 NA Urban Y
## 498 NA Urban Y
## 504 NA Rural N
## 507 NA Urban Y
## 525 1 Semiurban N
## 531 NA Rural Y
## 534 NA Semiurban N
## 545 NA Semiurban Y
## 551 0 Rural N
## 552 1 Rural Y
## 557 NA Urban Y
## 566 NA Rural Y
## 584 NA Rural N
## 601 NA Urban N
## 606 1 Urban N
# Keep only the fully observed applications (drops 85 of the 614 rows).
# NOTE(review): complete.cases() only flags NA; blank strings in the character
# columns (e.g. empty Gender or Self_Employed cells) appear to survive this
# filter -- confirm whether they should be read in as NA.
my_loan_data <- subset(my_loan_data, subset = complete.cases(my_loan_data))
dim(x = my_loan_data)
## [1] 529 13
# Recode Loan_Status in place as a numeric indicator (Y = 1, N = 0).
# Referring to the column by name inside mutate() overwrites it where it
# already sits, so no temporary column, drop, or positional rename is needed.
my_loan_data <- my_loan_data %>%
  mutate(Loan_Status = ifelse(Loan_Status == "Y", 1, 0))
Dream Housing Finance company deals in all home loans. They have a presence across all urban, semi-urban and rural areas. A customer first applies for a home loan, after which the company validates the customer's eligibility for the loan.
Problem: The company wants to automate the loan eligibility process (in real time) based on the customer details provided while filling out the online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History, and others. To automate this process, they have posed the problem of identifying the customer segments that are eligible for a loan, so that they can specifically target these customers.
There are 614 cases and 13 columns. Each case represents a loan application.
This data set was provided as part of a data science challenge. I downloaded the data and uploaded it to my GitHub account; I will read it into R from there.
Source: https://datahack.analyticsvidhya.com/contest/practice-problem-loan-prediction-iii/.
Observational
Source: https://datahack.analyticsvidhya.com/contest/practice-problem-loan-prediction-iii/
Loan_Status is the dependent variable. It is a categorical variable that records whether the loan was approved (yes or no).
There are several candidate independent variables. I will choose the most appropriate ones after doing exploratory analysis. The preliminary candidates are: credit history; applicant income; education (whether the applicant is a graduate); gender of the applicant; number of dependents; and property area.
# Inspect the column types of the cleaned data.
str(object = my_loan_data)
## 'data.frame': 529 obs. of 13 variables:
## $ Loan_ID : chr "LP001003" "LP001005" "LP001006" "LP001008" ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Married : chr "Yes" "Yes" "Yes" "No" ...
## $ Dependents : chr "1" "0" "0" "0" ...
## $ Education : chr "Graduate" "Graduate" "Not Graduate" "Graduate" ...
## $ Self_Employed : chr "No" "Yes" "No" "No" ...
## $ ApplicantIncome : int 4583 3000 2583 6000 5417 2333 3036 4006 12841 3200 ...
## $ CoapplicantIncome: num 1508 0 2358 0 4196 ...
## $ LoanAmount : int 128 66 120 141 267 95 158 168 349 70 ...
## $ Loan_Amount_Term : int 360 360 360 360 360 360 360 360 360 360 ...
## $ Credit_History : int 1 1 1 1 1 1 0 1 1 1 ...
## $ Property_Area : chr "Rural" "Urban" "Urban" "Urban" ...
## $ Loan_Status : num 0 1 1 1 1 1 0 1 0 1 ...
# Descriptive statistics for every column; the rows marked with * in the
# output below are the character columns.
psych::describe(x = my_loan_data)
## vars n mean sd median trimmed mad min max
## Loan_ID* 1 529 265.00 152.85 265 265.00 195.70 1 529
## Gender* 2 529 2.78 0.47 3 2.87 0.00 1 3
## Married* 3 529 2.64 0.49 3 2.68 0.00 1 3
## Dependents* 4 529 2.74 1.05 2 2.60 0.00 1 5
## Education* 5 529 1.20 0.40 1 1.13 0.00 1 2
## Self_Employed* 6 529 2.09 0.42 2 2.04 0.00 1 3
## ApplicantIncome 7 529 5507.82 6404.13 3816 4346.45 1802.84 150 81000
## CoapplicantIncome 8 529 1542.39 2524.30 1086 1118.17 1610.10 0 33837
## LoanAmount 9 529 145.85 84.11 128 133.26 45.96 9 700
## Loan_Amount_Term 10 529 342.35 64.86 360 358.31 0.00 36 480
## Credit_History 11 529 0.85 0.36 1 0.94 0.00 0 1
## Property_Area* 12 529 2.02 0.78 2 2.02 1.48 1 3
## Loan_Status 13 529 0.69 0.46 1 0.74 0.00 0 1
## range skew kurtosis se
## Loan_ID* 528 0.00 -1.21 6.65
## Gender* 2 -1.95 3.03 0.02
## Married* 2 -0.67 -1.31 0.02
## Dependents* 4 0.86 -0.49 0.05
## Education* 1 1.46 0.14 0.02
## Self_Employed* 2 0.56 2.31 0.02
## ApplicantIncome 80850 6.43 56.78 278.44
## CoapplicantIncome 33837 5.96 60.12 109.75
## LoanAmount 691 2.59 9.94 3.66
## Loan_Amount_Term 444 -2.26 6.06 2.82
## Credit_History 1 -1.96 1.85 0.02
## Property_Area* 2 -0.03 -1.35 0.03
## Loan_Status 1 -0.83 -1.32 0.02
# Property_Area is stored as character, so summary() only reports its length
# and storage mode.
summary(my_loan_data$Property_Area)
## Length Class Mode
## 529 character character
# Number of applications per property area, one panel per Loan_Status value.
# Columns are referenced by bare name so ggplot2 resolves them from `data`.
# geom_bar() counts rows per category by default; the previous
# geom_histogram(stat = "count") call raised an "Ignoring unknown parameters"
# warning.
ggplot(data = my_loan_data, aes(x = Property_Area)) +
  geom_bar(col = "blue", fill = "lightblue") +
  facet_grid(~Loan_Status) +
  scale_x_discrete()
# Quartiles, mean and range of the co-applicant income.
summary(my_loan_data$CoapplicantIncome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 1086 1542 2232 33837
# Distribution of co-applicant income in 15 bins, one panel per Loan_Status
# value. Columns are referenced by bare name so ggplot2 resolves them from
# `data`.
ggplot(data = my_loan_data, aes(x = CoapplicantIncome)) +
  geom_histogram(col = "yellow", fill = "pink", bins = 15) +
  facet_grid(~Loan_Status) +
  theme_bw()
# Education is stored as character, so summary() only reports its length and
# storage mode.
summary(my_loan_data$Education)
## Length Class Mode
## 529 character character
# Number of applications per education level, one panel per Loan_Status value.
# Columns are referenced by bare name so ggplot2 resolves them from `data`.
# geom_bar() counts rows per category by default; the previous
# geom_histogram(stat = "count") call raised an "Ignoring unknown parameters"
# warning.
ggplot(data = my_loan_data, aes(x = Education)) +
  geom_bar(col = "lightgreen", fill = "blue") +
  facet_grid(~Loan_Status) +
  scale_x_discrete() +
  theme_bw()
# Dependents is stored as character (it includes the "3+" category), so
# summary() only reports its length and storage mode.
summary(my_loan_data$Dependents)
## Length Class Mode
## 529 character character
# Number of applications per dependents category, one panel per Loan_Status
# value. Columns are referenced by bare name so ggplot2 resolves them from
# `data`. geom_bar() counts rows per category by default; the previous
# geom_histogram(stat = "count") call raised an "Ignoring unknown parameters"
# warning.
ggplot(data = my_loan_data, aes(x = Dependents)) +
  geom_bar(col = "lightyellow", fill = "lightgreen") +
  facet_grid(~Loan_Status) +
  scale_x_discrete() +
  theme_bw()
# Gender is stored as character, so summary() only reports its length and
# storage mode.
summary(my_loan_data$Gender)
## Length Class Mode
## 529 character character
# Number of applications per gender, one panel per Loan_Status value.
# Columns are referenced by bare name so ggplot2 resolves them from `data`.
# geom_bar() counts rows per category by default; the previous
# geom_histogram(stat = "count") call raised an "Ignoring unknown parameters"
# warning.
ggplot(data = my_loan_data, aes(x = Gender)) +
  geom_bar(col = "lightgrey", fill = "lightblue") +
  facet_grid(~Loan_Status) +
  scale_x_discrete() +
  theme_bw()