Loan Data Analysis
Objective of this analysis, is to explore the loan data set provided on Kaggle for Q1 2017 from the Lending Club. We will go through data cleaning and then explore potantial relationship between features and bad loans in order to find a way of predicting bad loans. Not only it is useful for avoiding bad loans, but also for services personalization. The data can be found under https://data.world/lpetrocelli/lendingclub-loan-data-2017-q-1
The csv file is separated by comma and values are between quotes, on top we need to clean some non standard characters, this is a bit more complicated than usual
lines <- readLines("LoanStats_2017Q1 2.csv")
lines <-str_replace_all(lines,"[^[:graph:]]", " ")
lines <- gsub('(^"|"$)', "", lines)
lines <- gsub("\"","" ,lines)
loans<-read.csv(textConnection(lines), sep=',',stringsAsFactors = FALSE)
Let us remove some unwanted columns and let us remove some columns we believe are not relevant. For now we do not need IDs, zip codes and states.
removedColumns<-select(loans, c(id,member_id,emp_title, zip_code,addr_state))
loans<-select(loans, -c(id,member_id,emp_title))
loans<-select(loans, -c(url,desc,purpose,title,
sec_app_earliest_cr_line,
sec_app_inq_last_6mths,
sec_app_mths_since_last_major_derog.))
Convert percentage columns into numeric columns and remove the % sign to get numeric values.
loans$int_rate<-as.numeric(gsub("[\\%]", "", loans$int_rate))
loans$revol_util<-as.numeric(gsub("[\\%]", "", loans$revol_util))
loans$total_acc<-as.numeric(gsub("[\\%]", "", loans$total_acc))
Get employment length and remove years to get cleaner data.
loans$emp_length<-gsub("[^0-9+<]","",loans$emp_length)
loans$emp_length<-as.factor(loans$emp_length)
Identify numeric columns and convert them to numeric
numCols=c("annual_inc","loan_amnt","funded_amnt", "out_prncp","out_prncp_inv","dti","delinq_2yrs",
"mths_since_last_delinq","mths_since_last_record","earliest_cr_line",
"inq_last_6mths","total_pymnt","last_pymnt_amnt", "annual_inc_joint","dti_joint","mths_since_last_major_derog")
for (col in numCols) {
loans[[col]]<-as.numeric(loans[[col]])
}
Remove NA values from known columns so far and replace by 0. Then identify numeric columns for further analysis.
# remove NA values from known columns so far and replace by 0
loans$num_accts_ever_120_pd[is.na(loans$num_accts_ever_120_pd)]<-0
loans$num_tl_120dpd_2m[is.na(loans$num_tl_120dpd_2m)]<-0
loans$num_tl_30dpd[is.na(loans$num_tl_30dpd)]<-0
loans$num_tl_90g_dpd_24m[is.na(loans$num_tl_90g_dpd_24m)]<-0
loans$num_tl_op_past_12m[is.na(loans$num_tl_op_past_12m)]<-0
loans$delinq_2yrs[is.na(loans$delinq_2yrs)]<-0
loans$mths_since_last_delinq[is.na(loans$mths_since_last_delinq)]<-0
loans$mths_since_last_major_derog[is.na(loans$mths_since_last_major_derog)]<-0
# figure out which columns are numeric so that we can look at the distribution
numeric_cols <- sapply(loans, is.numeric)
Identify bad loans and add a flag for bad loans based on the indicators found in loan_status column.
# 'bad' statuses
badIndicators <- c("Charged Off",
"Default",
"Does not meet the credit policy. Status:Charged Off",
"In Grace Period",
"Default Receiver",
"Late (16-30 days)",
"Late (31-120 days)")
# assign certain statuses to a 'bad' ('1') group
loans$is_bad <- ifelse(loans$loan_status %in% badIndicators, 1,0)
loans$is_bad<-as.factor(loans$is_bad)
loans<-select(loans,-loan_status)
We have a very small percentage of bad loans, so the data set is highly imbalanced.
First we determine percentage of NAs in each column, then remove columns where NA values represent more than 80% For other columns we replace NA values with 0 for now
naPercentage <-sapply(loans, function(y) sum(length(which(is.na(y))))/length(y))
# let us remove all columns with more than 80% NAs
AllNA <- which(naPercentage>0.80)
loans<-loans[,-AllNA]
naPercentage <-sapply(loans, function(y) sum(length(which(is.na(y))))/length(y))
colnames(loans)[colSums(is.na(loans)) > 0]
## [1] "loan_amnt" "funded_amnt"
## [3] "funded_amnt_inv" "int_rate"
## [5] "installment" "annual_inc"
## [7] "dti" "inq_last_6mths"
## [9] "open_acc" "pub_rec"
## [11] "revol_bal" "revol_util"
## [13] "total_acc" "out_prncp"
## [15] "out_prncp_inv" "total_pymnt"
## [17] "total_pymnt_inv" "total_rec_prncp"
## [19] "total_rec_int" "total_rec_late_fee"
## [21] "recoveries" "collection_recovery_fee"
## [23] "last_pymnt_amnt" "tot_coll_amt"
## [25] "tot_cur_bal" "open_acc_6m"
## [27] "open_il_6m" "open_il_12m"
## [29] "open_il_24m" "mths_since_rcnt_il"
## [31] "total_bal_il" "il_util"
## [33] "open_rv_12m" "open_rv_24m"
## [35] "max_bal_bc" "all_util"
## [37] "total_rev_hi_lim" "inq_fi"
## [39] "total_cu_tl" "inq_last_12m"
## [41] "acc_open_past_24mths" "avg_cur_bal"
## [43] "bc_open_to_buy" "bc_util"
## [45] "chargeoff_within_12_mths" "delinq_amnt"
## [47] "mo_sin_old_il_acct" "mo_sin_old_rev_tl_op"
## [49] "mo_sin_rcnt_rev_tl_op" "mo_sin_rcnt_tl"
## [51] "mort_acc" "mths_since_recent_bc"
## [53] "mths_since_recent_bc_dlq" "mths_since_recent_inq"
## [55] "mths_since_recent_revol_delinq" "num_actv_bc_tl"
## [57] "num_actv_rev_tl" "num_bc_sats"
## [59] "num_bc_tl" "num_il_tl"
## [61] "num_op_rev_tl" "num_rev_accts"
## [63] "num_rev_tl_bal_gt_0" "num_sats"
## [65] "pct_tl_nvr_dlq" "percent_bc_gt_75"
## [67] "pub_rec_bankruptcies" "tax_liens"
## [69] "tot_hi_cred_lim" "total_bal_ex_mort"
## [71] "total_bc_limit" "total_il_high_credit_limit"
loans[is.na(loans)] <- 0
Convert character columns to factors for later use
loans <- mutate_if(loans, is.character, as.factor)
Let us look at few distributions, first by loan amount and second by installment amount
fig <- plot_ly(x = loans$loan_amnt, type = "histogram")
fig
Let us look at interest rate by grade
fig <- plot_ly(loans, y = ~int_rate, color = ~ grade, type = "box")
fig