Importing data

# Read the loan records from the CSV file in the working directory.
loandata <- read.csv(file = "./loan.csv")

# Number of rows and columns in the loaded table.
dim(loandata)
## [1] 887379     74

# Column names available for the analysis below.
names(loandata)
##  [1] "id"                          "member_id"                  
##  [3] "loan_amnt"                   "funded_amnt"                
##  [5] "funded_amnt_inv"             "term"                       
##  [7] "int_rate"                    "installment"                
##  [9] "grade"                       "sub_grade"                  
## [11] "emp_title"                   "emp_length"                 
## [13] "home_ownership"              "annual_inc"                 
## [15] "verification_status"         "issue_d"                    
## [17] "loan_status"                 "pymnt_plan"                 
## [19] "url"                         "desc"                       
## [21] "purpose"                     "title"                      
## [23] "zip_code"                    "addr_state"                 
## [25] "dti"                         "delinq_2yrs"                
## [27] "earliest_cr_line"            "inq_last_6mths"             
## [29] "mths_since_last_delinq"      "mths_since_last_record"     
## [31] "open_acc"                    "pub_rec"                    
## [33] "revol_bal"                   "revol_util"                 
## [35] "total_acc"                   "initial_list_status"        
## [37] "out_prncp"                   "out_prncp_inv"              
## [39] "total_pymnt"                 "total_pymnt_inv"            
## [41] "total_rec_prncp"             "total_rec_int"              
## [43] "total_rec_late_fee"          "recoveries"                 
## [45] "collection_recovery_fee"     "last_pymnt_d"               
## [47] "last_pymnt_amnt"             "next_pymnt_d"               
## [49] "last_credit_pull_d"          "collections_12_mths_ex_med" 
## [51] "mths_since_last_major_derog" "policy_code"                
## [53] "application_type"            "annual_inc_joint"           
## [55] "dti_joint"                   "verification_status_joint"  
## [57] "acc_now_delinq"              "tot_coll_amt"               
## [59] "tot_cur_bal"                 "open_acc_6m"                
## [61] "open_il_6m"                  "open_il_12m"                
## [63] "open_il_24m"                 "mths_since_rcnt_il"         
## [65] "total_bal_il"                "il_util"                    
## [67] "open_rv_12m"                 "open_rv_24m"                
## [69] "max_bal_bc"                  "all_util"                   
## [71] "total_rev_hi_lim"            "inq_fi"                     
## [73] "total_cu_tl"                 "inq_last_12m"

Loan amount Distribution based on Grades assigned by Lending Club

library(ggplot2)

# Histogram of loan amounts (50 bins), drawn as one facet row per grade;
# the bar outlines are coloured by grade.
ggplot(data = loandata, mapping = aes(x = loan_amnt, colour = grade)) +
  geom_histogram(bins = 50) +
  facet_grid(grade ~ .)

Those with higher grades (A, B, C and D) have received more loans compared to those with lower grades (E, F and G).

Exploring interest rates based on Grades assigned by Lending Club

library(ggplot2)

# Density of the interest rate, filled by grade and stacked vertically as one
# facet row per grade so the curves can be compared on a shared x axis.
ggplot(data = loandata, mapping = aes(x = int_rate, fill = grade)) +
  geom_density() +
  facet_grid(grade ~ .)

Grades are assigned based on risk, and so interest rates go up as the risk goes up.

Total loan amount issued each month over the years 2007-2015

# Total loan amount issued per issue date.
library(lubridate)

# A day ("01-") is prepended so that `dmy()` can parse `issue_d`; this presumes
# the raw values hold only a month and a year (e.g. "Dec-2011") -- confirm
# against loan.csv.
# The conversion overwrites the column in place, so it is guarded: once
# `issue_d` is a Date, pasting "01-" in front of it no longer yields
# day-month-year text and re-running the line would corrupt the column.
if (!inherits(loandata$issue_d, "Date")) {
  loandata$issue_d <- dmy(paste0("01-", loandata$issue_d))
}

# Sum the loan amounts for each issue date, then draw one bar per date.
loan_amnt_by_month <- aggregate(
  loan_amnt ~ issue_d,
  data = loandata,
  FUN = sum
)
ggplot(loan_amnt_by_month, aes(x = issue_d, y = loan_amnt)) +
  geom_col()

Total loan amount for each loan status

# Sum the loan amounts within each loan status.
loan_amnt_by_status <- aggregate(
  loan_amnt ~ loan_status,
  data = loandata,
  FUN = sum
)

# One bar per status. The x-axis breaks are switched off, so the statuses are
# identified through the fill legend rather than through axis labels.
ggplot(
  data = loan_amnt_by_status,
  mapping = aes(x = loan_status, y = loan_amnt, fill = loan_status)
) +
  geom_bar(stat = "identity") +
  scale_x_discrete(breaks = NULL)

Descriptions of the loan statuses can be found at this link.

Distribution of the loan amount for each status

# Box plot of the individual loan amounts for each loan status. The x-axis
# breaks are switched off; the fill legend identifies the statuses instead.
ggplot(
  data = loandata,
  mapping = aes(x = loan_status, y = loan_amnt, fill = loan_status)
) +
  geom_boxplot() +
  scale_x_discrete(breaks = NULL)

Exploring Unpaid loans

# Total loan amount for each value of `paidVsUnpaid`.
# NOTE(review): `paidVsUnpaid` is not created in the visible code; presumably
# it is a column added to `loandata` in an unshown step -- confirm.
#
# The amounts are summed first, so one bar is drawn per category instead of
# stacking one rectangle per raw loan record; the bar heights are the same
# totals. `aggregate()` drops rows with a missing value in either column.
loan_amnt_by_paid <- aggregate(
  loan_amnt ~ paidVsUnpaid,
  data = loandata,
  FUN = sum
)
ggplot(
  loan_amnt_by_paid,
  aes(x = paidVsUnpaid, y = loan_amnt, fill = paidVsUnpaid)
) +
  geom_col()

Proportion of Paid Vs. Unpaid loan amount over the Grades

# NOTE(review): `gbar` is not defined in the visible code; presumably it is a
# ggplot object (data plus aesthetics) built in an unshown step -- confirm.
# position = "fill" rescales every bar to the same height, so the bars show
# proportions rather than absolute amounts; the x-axis text is shrunk to size 7.
gbar +
  geom_bar(position = "fill", stat = "identity") +
  theme(axis.text.x = element_text(size = 7))

As the grade goes down, the unpaid proportion of the loan amount clearly increases.

Distribution of the loan by different purposes

# Box plot of loan amounts for each purpose, split by `paidVsUnpaid` through
# the fill colour. The x-axis labels are rotated by 90 degrees and shrunk to
# size 8.
# NOTE(review): `paidVsUnpaid` is not created in the visible code -- confirm
# where it is added to `loandata`.
ggplot(
  data = loandata,
  mapping = aes(x = purpose, y = loan_amnt, fill = paidVsUnpaid)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(size = 8, angle = 90))

Proportion of Paid Vs. Unpaid loan amount by different purposes

# Sum the loan amounts for every combination of purpose and `paidVsUnpaid`.
# NOTE(review): `paidVsUnpaid` is not created in the visible code -- confirm
# where it is added to `loandata`.
Loan_by_purpose <- aggregate(
  loan_amnt ~ purpose + paidVsUnpaid,
  data = loandata,
  FUN = sum
)

# position = "fill" rescales each purpose's bar to the same height, so the
# fill colours show the share of the summed amount per `paidVsUnpaid` value.
# The x-axis labels are rotated by 90 degrees and shrunk to size 8.
ggplot(
  data = Loan_by_purpose,
  mapping = aes(x = purpose, y = loan_amnt, fill = paidVsUnpaid)
) +
  geom_bar(position = "fill", stat = "identity") +
  theme(axis.text.x = element_text(size = 8, angle = 90))

Nearly 25% of the loan amount issued for educational and small-business purposes is unpaid.