# Read the loan data set from the current working directory into a data frame.
loandata <- read.csv(file = "./loan.csv")
# Show the number of rows and columns that were loaded.
dim(loandata)
## [1] 887379 74
# List every column name of the loaded data frame.
colnames(loandata)
## [1] "id" "member_id"
## [3] "loan_amnt" "funded_amnt"
## [5] "funded_amnt_inv" "term"
## [7] "int_rate" "installment"
## [9] "grade" "sub_grade"
## [11] "emp_title" "emp_length"
## [13] "home_ownership" "annual_inc"
## [15] "verification_status" "issue_d"
## [17] "loan_status" "pymnt_plan"
## [19] "url" "desc"
## [21] "purpose" "title"
## [23] "zip_code" "addr_state"
## [25] "dti" "delinq_2yrs"
## [27] "earliest_cr_line" "inq_last_6mths"
## [29] "mths_since_last_delinq" "mths_since_last_record"
## [31] "open_acc" "pub_rec"
## [33] "revol_bal" "revol_util"
## [35] "total_acc" "initial_list_status"
## [37] "out_prncp" "out_prncp_inv"
## [39] "total_pymnt" "total_pymnt_inv"
## [41] "total_rec_prncp" "total_rec_int"
## [43] "total_rec_late_fee" "recoveries"
## [45] "collection_recovery_fee" "last_pymnt_d"
## [47] "last_pymnt_amnt" "next_pymnt_d"
## [49] "last_credit_pull_d" "collections_12_mths_ex_med"
## [51] "mths_since_last_major_derog" "policy_code"
## [53] "application_type" "annual_inc_joint"
## [55] "dti_joint" "verification_status_joint"
## [57] "acc_now_delinq" "tot_coll_amt"
## [59] "tot_cur_bal" "open_acc_6m"
## [61] "open_il_6m" "open_il_12m"
## [63] "open_il_24m" "mths_since_rcnt_il"
## [65] "total_bal_il" "il_util"
## [67] "open_rv_12m" "open_rv_24m"
## [69] "max_bal_bc" "all_util"
## [71] "total_rev_hi_lim" "inq_fi"
## [73] "total_cu_tl" "inq_last_12m"
library(ggplot2)
# Histogram of loan amounts (50 bins), drawn as one facet row per grade;
# the bar outlines are coloured by grade.
ggplot(data = loandata, mapping = aes(x = loan_amnt, colour = grade)) +
  geom_histogram(bins = 50) +
  facet_grid(grade ~ .)
Those with higher grades (A, B, C and D) have received more loans compared to those with lower grades (E, F and G).
library(ggplot2)
# Density of the interest rate, filled by grade, one facet row per grade.
ggplot(data = loandata, mapping = aes(x = int_rate, fill = grade)) +
  geom_density() +
  facet_grid(grade ~ .)
Grades are assigned based on risk, and so interest rates go up as the risk goes up.
library(lubridate)
# Prefix every issue_d value with "01-" and parse the result as a
# day-month-year date.
# NOTE(review): presumably issue_d is month-year text (e.g. "Dec-2011") -
# confirm against the data.
loandata$issue_d <- dmy(paste0("01-", loandata$issue_d))
# Total loan amount for each issue date.
loan_amnt_by_month <- aggregate(
  loan_amnt ~ issue_d,
  data = loandata,
  FUN = sum
)
# One bar per issue date, bar height = summed loan amount.
ggplot(data = loan_amnt_by_month, mapping = aes(x = issue_d, y = loan_amnt)) +
  geom_col()
# Total loan amount for each loan status.
loan_amnt_by_status <- aggregate(
  loan_amnt ~ loan_status,
  data = loandata,
  FUN = sum
)
# One bar per status. The x-axis breaks are switched off (breaks = NULL);
# the fill legend identifies each status instead.
ggplot(
  data = loan_amnt_by_status,
  mapping = aes(x = loan_status, y = loan_amnt, fill = loan_status)
) +
  geom_col() +
  scale_x_discrete(breaks = NULL)
Descriptions of the loan statuses can be found at this link.
# Box plot of the individual loan amounts for each loan status. The x-axis
# breaks are switched off (breaks = NULL); the fill legend identifies each
# status instead.
ggplot(
  data = loandata,
  mapping = aes(x = loan_status, y = loan_amnt, fill = loan_status)
) +
  geom_boxplot() +
  scale_x_discrete(breaks = NULL)
Creating a new column with two factor levels:
- “Paid/current” - the loan status is Current or Fully Paid (including fully paid loans that do not meet the credit policy).
- “Other” - defaults, charge-offs and all other statuses.
# Classify every loan as "Paid/current" or "Other" based on its loan_status.
# Statuses listed here count as paid/current; every other status (and NA)
# falls under "Other".
paid_statuses <- c(
  "Fully Paid",
  "Current",
  "Does not meet the credit policy. Status:Fully Paid"
)
# %in% never yields NA, so no which() guard is needed. The explicit levels
# put "Paid/current" first without relying on reversing the alphabetical
# order, and keep both levels even if one category has no rows.
loandata$paidVsUnpaid <- factor(
  ifelse(loandata$loan_status %in% paid_statuses, "Paid/current", "Other"),
  levels = c("Paid/current", "Other")
)
# Number of loans in each category.
table(loandata$paidVsUnpaid)
##
## Paid/current Other
## 811490 75889
# Total loan amount per paid/unpaid category. Summing first draws one bar per
# category instead of stacking one rectangle per loan row; the bar heights
# are the same totals.
loan_amnt_by_paid <- aggregate(
  loan_amnt ~ paidVsUnpaid,
  data = loandata,
  FUN = sum
)
ggplot(
  data = loan_amnt_by_paid,
  mapping = aes(x = paidVsUnpaid, y = loan_amnt, fill = paidVsUnpaid)
) +
  geom_col()
# Total loan amount for each combination of sub-grade and paid/unpaid category.
loan_by_grade <- aggregate(
  loan_amnt ~ sub_grade + paidVsUnpaid,
  data = loandata,
  FUN = sum
)
# Plot base shared by the two bar charts below.
gbar <- ggplot(
  data = loan_by_grade,
  mapping = aes(x = sub_grade, y = loan_amnt, fill = paidVsUnpaid)
)
# Stacked totals per sub-grade (small x-axis text so the labels fit).
gbar +
  geom_col() +
  theme(axis.text.x = element_text(size = 7))
# The same bars rescaled so each sub-grade shows proportions.
gbar +
  geom_col(position = "fill") +
  theme(axis.text.x = element_text(size = 7))
It is clear that as the grade goes down, the proportion of the loan amount that is unpaid increases.
# Box plot of the individual loan amounts for each purpose, split by
# paid/unpaid category; x-axis labels are rotated 90 degrees.
ggplot(
  data = loandata,
  mapping = aes(x = purpose, y = loan_amnt, fill = paidVsUnpaid)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(size = 8, angle = 90))
# Total loan amount for each combination of purpose and paid/unpaid category.
Loan_by_purpose <- aggregate(
  loan_amnt ~ purpose + paidVsUnpaid,
  data = loandata,
  FUN = sum
)
# Bars rescaled so each purpose shows the paid/unpaid proportions of its
# total loan amount; x-axis labels are rotated 90 degrees.
ggplot(
  data = Loan_by_purpose,
  mapping = aes(x = purpose, y = loan_amnt, fill = paidVsUnpaid)
) +
  geom_col(position = "fill") +
  theme(axis.text.x = element_text(size = 8, angle = 90))
For loans taken for educational and small-business purposes, nearly 25% of the loan amount is unpaid.