Loan Data Analysis

Introduction

Objective of this analysis, is to explore the loan data set provided on Kaggle for Q1 2017 from the Lending Club. We will go through data cleaning and then explore potantial relationship between features and bad loans in order to find a way of predicting bad loans. Not only it is useful for avoiding bad loans, but also for services personalization. The data can be found under https://data.world/lpetrocelli/lendingclub-loan-data-2017-q-1

Loading the data set

The csv file is separated by comma and values are between quotes, on top we need to clean some non standard characters, this is a bit more complicated than usual

lines <- readLines("LoanStats_2017Q1 2.csv")
lines <-str_replace_all(lines,"[^[:graph:]]", " ") 
lines <- gsub('(^"|"$)', "", lines)
lines <- gsub("\"","" ,lines)
loans<-read.csv(textConnection(lines), sep=',',stringsAsFactors = FALSE)

Transforming the data

Unnecessary columns

Let us remove some unwanted columns and let us remove some columns we believe are not relevant. For now we do not need IDs, zip codes and states.

removedColumns<-select(loans, c(id,member_id,emp_title, zip_code,addr_state))
loans<-select(loans, -c(id,member_id,emp_title))
loans<-select(loans, -c(url,desc,purpose,title,
                        sec_app_earliest_cr_line,
                        sec_app_inq_last_6mths,
                        sec_app_mths_since_last_major_derog.))

Convert percentages columns

Convert percentage columns into numeric columns and remove the % sign to get numeric values.

loans$int_rate<-as.numeric(gsub("[\\%]", "", loans$int_rate))
loans$revol_util<-as.numeric(gsub("[\\%]", "", loans$revol_util))
loans$total_acc<-as.numeric(gsub("[\\%]", "", loans$total_acc))

Clean employment length

Get employment length and remove years to get cleaner data.

loans$emp_length<-gsub("[^0-9+<]","",loans$emp_length)
loans$emp_length<-as.factor(loans$emp_length)

Identify numeric columns

Identify numeric columns and convert them to numeric

numCols=c("annual_inc","loan_amnt","funded_amnt", "out_prncp","out_prncp_inv","dti","delinq_2yrs",
        "mths_since_last_delinq","mths_since_last_record","earliest_cr_line",
        "inq_last_6mths","total_pymnt","last_pymnt_amnt", "annual_inc_joint","dti_joint","mths_since_last_major_derog")
          
for (col in numCols) {
  loans[[col]]<-as.numeric(loans[[col]])
}

Remove null values

Remove NA values from known columns so far and replace by 0. Then identify numeric columns for further analysis.

# remove NA values from known columns so far and replace by 0
loans$num_accts_ever_120_pd[is.na(loans$num_accts_ever_120_pd)]<-0  
loans$num_tl_120dpd_2m[is.na(loans$num_tl_120dpd_2m)]<-0  
loans$num_tl_30dpd[is.na(loans$num_tl_30dpd)]<-0  
loans$num_tl_90g_dpd_24m[is.na(loans$num_tl_90g_dpd_24m)]<-0 
loans$num_tl_op_past_12m[is.na(loans$num_tl_op_past_12m)]<-0 
loans$delinq_2yrs[is.na(loans$delinq_2yrs)]<-0 
loans$mths_since_last_delinq[is.na(loans$mths_since_last_delinq)]<-0
loans$mths_since_last_major_derog[is.na(loans$mths_since_last_major_derog)]<-0
# figure out which columns are numeric so that we can look at the distribution
numeric_cols <- sapply(loans, is.numeric)

Identify bad loans

Identify bad loans and add a flag for bad loans based on the indicators found in loan_status column.

# 'bad' statuses
badIndicators <- c("Charged Off",
                    "Default",
                    "Does not meet the credit policy. Status:Charged Off",
                    "In Grace Period", 
                    "Default Receiver",
                    "Late (16-30 days)",
                    "Late (31-120 days)")

# assign certain statuses to a 'bad' ('1') group
loans$is_bad <- ifelse(loans$loan_status %in% badIndicators, 1,0)
loans$is_bad<-as.factor(loans$is_bad)
loans<-select(loans,-loan_status)

We have a very small percentage of bad loans, so the data set is highly imbalanced.

Missing values

First we determine percentage of NAs in each column, then remove columns where NA values represent more than 80% For other columns we replace NA values with 0 for now

naPercentage <-sapply(loans, function(y) sum(length(which(is.na(y))))/length(y))

# let us remove all columns with more than 80% NAs
AllNA <- which(naPercentage>0.80)
loans<-loans[,-AllNA]
naPercentage <-sapply(loans, function(y) sum(length(which(is.na(y))))/length(y))
colnames(loans)[colSums(is.na(loans)) > 0]
##  [1] "loan_amnt"                      "funded_amnt"                   
##  [3] "funded_amnt_inv"                "int_rate"                      
##  [5] "installment"                    "annual_inc"                    
##  [7] "dti"                            "inq_last_6mths"                
##  [9] "open_acc"                       "pub_rec"                       
## [11] "revol_bal"                      "revol_util"                    
## [13] "total_acc"                      "out_prncp"                     
## [15] "out_prncp_inv"                  "total_pymnt"                   
## [17] "total_pymnt_inv"                "total_rec_prncp"               
## [19] "total_rec_int"                  "total_rec_late_fee"            
## [21] "recoveries"                     "collection_recovery_fee"       
## [23] "last_pymnt_amnt"                "tot_coll_amt"                  
## [25] "tot_cur_bal"                    "open_acc_6m"                   
## [27] "open_il_6m"                     "open_il_12m"                   
## [29] "open_il_24m"                    "mths_since_rcnt_il"            
## [31] "total_bal_il"                   "il_util"                       
## [33] "open_rv_12m"                    "open_rv_24m"                   
## [35] "max_bal_bc"                     "all_util"                      
## [37] "total_rev_hi_lim"               "inq_fi"                        
## [39] "total_cu_tl"                    "inq_last_12m"                  
## [41] "acc_open_past_24mths"           "avg_cur_bal"                   
## [43] "bc_open_to_buy"                 "bc_util"                       
## [45] "chargeoff_within_12_mths"       "delinq_amnt"                   
## [47] "mo_sin_old_il_acct"             "mo_sin_old_rev_tl_op"          
## [49] "mo_sin_rcnt_rev_tl_op"          "mo_sin_rcnt_tl"                
## [51] "mort_acc"                       "mths_since_recent_bc"          
## [53] "mths_since_recent_bc_dlq"       "mths_since_recent_inq"         
## [55] "mths_since_recent_revol_delinq" "num_actv_bc_tl"                
## [57] "num_actv_rev_tl"                "num_bc_sats"                   
## [59] "num_bc_tl"                      "num_il_tl"                     
## [61] "num_op_rev_tl"                  "num_rev_accts"                 
## [63] "num_rev_tl_bal_gt_0"            "num_sats"                      
## [65] "pct_tl_nvr_dlq"                 "percent_bc_gt_75"              
## [67] "pub_rec_bankruptcies"           "tax_liens"                     
## [69] "tot_hi_cred_lim"                "total_bal_ex_mort"             
## [71] "total_bc_limit"                 "total_il_high_credit_limit"
loans[is.na(loans)] <- 0

Convert characters to factors

Convert character columns to factors for later use

loans <- mutate_if(loans, is.character, as.factor)

Exploring the data using plotly

Let us look at few distributions, first by loan amount and second by installment amount

fig <- plot_ly(x = loans$loan_amnt, type = "histogram")
fig

Let us look at interest rate by grade

fig <- plot_ly(loans, y = ~int_rate, color = ~ grade, type = "box")
fig