0.1 About the dataset

These files contain complete loan data for all loans issued through the 2007-2015, including the current loan status (Current, Late, Fully Paid, etc.) and latest payment information. The file containing loan data through the “present” contains complete loan data for all loans issued through the previous completed calendar quarter. Additional features include credit scores, number of finance inquiries, address including zip codes, and state, and collections among others. The file is a matrix of about 890 thousand observations and 75 variables. A data dictionary is provided in a separate file.

getwd()
## [1] "C:/Users/Administrator/Desktop/BIG DATA"
setwd("C:\\Users\\Administrator\\Desktop\\BIG DATA\\06 신용대출")
loan<-read.csv("loan.csv",header =TRUE)
head(loan,5)
##        id member_id loan_amnt funded_amnt funded_amnt_inv       term int_rate
## 1 1077501   1296599      5000        5000            4975  36 months    10.65
## 2 1077430   1314167      2500        2500            2500  60 months    15.27
## 3 1077175   1313524      2400        2400            2400  36 months    15.96
## 4 1076863   1277178     10000       10000           10000  36 months    13.49
## 5 1075358   1311748      3000        3000            3000  60 months    12.69
##   installment grade sub_grade                emp_title emp_length
## 1      162.87     B        B2                           10+ years
## 2       59.83     C        C4                    Ryder   < 1 year
## 3       84.33     C        C5                           10+ years
## 4      339.31     C        C1      AIR RESOURCES BOARD  10+ years
## 5       67.79     B        B5 University Medical Group     1 year
##   home_ownership annual_inc verification_status  issue_d loan_status pymnt_plan
## 1           RENT    24000.0            Verified Dec-2011  Fully Paid          n
## 2           RENT    30000.0     Source Verified Dec-2011 Charged Off          n
## 3           RENT    12252.0        Not Verified Dec-2011  Fully Paid          n
## 4           RENT    49200.0     Source Verified Dec-2011  Fully Paid          n
## 5           RENT    80000.0     Source Verified Dec-2011     Current          n
##                                                                    url
## 1 https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077501
## 2 https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077430
## 3 https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077175
## 4 https://www.lendingclub.com/browse/loanDetail.action?loan_id=1076863
## 5 https://www.lendingclub.com/browse/loanDetail.action?loan_id=1075358
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             desc
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Borrower added on 12/22/11 > I need to upgrade my business technologies.<br>
## 2   Borrower added on 12/22/11 > I plan to use this money to finance the motorcycle i am looking at. I plan to have it paid off as soon as possible/when i sell my old bike. I only need this money because the deal im looking at is to good to pass up.<br><br>  Borrower added on 12/22/11 > I plan to use this money to finance the motorcycle i am looking at. I plan to have it paid off as soon as possible/when i sell my old bike.I only need this money because the deal im looking at is to good to pass up. I have finished college with an associates degree in business and its takingmeplaces<br>
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
## 4                                                                                                                                                                                                                                                                                                                                                                                                                             Borrower added on 12/21/11 > to pay for property tax (borrow from friend, need to pay back) & central A/C need to be replace. I'm very sorry to let my loan expired last time.<br>
## 5                                                                                                                                                                                                                                                                                               Borrower added on 12/21/11 > I plan on combining three large interest bills together and freeing up some extra each month to pay toward other bills.  I've always been a good payor but have found myself needing to make adjustments to my budget due to a medical scare. My job is very stable, I love it.<br>
##          purpose                title zip_code addr_state   dti delinq_2yrs
## 1    credit_card             Computer    860xx         AZ 27.65         0.0
## 2            car                 bike    309xx         GA  1.00         0.0
## 3 small_business real estate business    606xx         IL  8.72         0.0
## 4          other             personel    917xx         CA 20.00         0.0
## 5          other             Personal    972xx         OR 17.94         0.0
##   earliest_cr_line inq_last_6mths mths_since_last_delinq mths_since_last_record
## 1         Jan-1985              1                     NA                     NA
## 2         Apr-1999              5                     NA                     NA
## 3         Nov-2001              2                     NA                     NA
## 4         Feb-1996              1                     35                     NA
## 5         Jan-1996              0                     38                     NA
##   open_acc pub_rec revol_bal revol_util total_acc initial_list_status out_prncp
## 1        3       0     13648       83.7       9.0                   f       0.0
## 2        3       0      1687        9.4       4.0                   f       0.0
## 3        2       0      2956       98.5      10.0                   f       0.0
## 4       10       0      5598       21.0      37.0                   f       0.0
## 5       15       0     27783       53.9      38.0                   f     766.9
##   out_prncp_inv total_pymnt total_pymnt_inv total_rec_prncp total_rec_int
## 1           0.0    5861.071         5831.78         5000.00        861.07
## 2           0.0    1008.710         1008.71          456.46        435.17
## 3           0.0    3003.654         3003.65         2400.00        603.65
## 4           0.0   12226.302        12226.30        10000.00       2209.33
## 5         766.9    3242.170         3242.17         2233.10       1009.07
##   total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d
## 1               0.00       0.00                     0.0     Jan-2015
## 2               0.00     117.08                    1.11     Apr-2013
## 3               0.00       0.00                     0.0     Jun-2014
## 4              16.97       0.00                     0.0     Jan-2015
## 5               0.00       0.00                     0.0     Jan-2016
##   last_pymnt_amnt next_pymnt_d last_credit_pull_d collections_12_mths_ex_med
## 1          171.62                        Jan-2016                          0
## 2          119.66                        Sep-2013                          0
## 3          649.91                        Jan-2016                          0
## 4          357.48                        Jan-2015                          0
## 5           67.79     Feb-2016           Jan-2016                          0
##   mths_since_last_major_derog policy_code application_type annual_inc_joint
## 1                          NA         1.0       INDIVIDUAL               NA
## 2                          NA         1.0       INDIVIDUAL               NA
## 3                          NA         1.0       INDIVIDUAL               NA
## 4                          NA         1.0       INDIVIDUAL               NA
## 5                          NA         1.0       INDIVIDUAL               NA
##   dti_joint verification_status_joint acc_now_delinq tot_coll_amt tot_cur_bal
## 1        NA                                        0           NA          NA
## 2        NA                                        0           NA          NA
## 3        NA                                        0           NA          NA
## 4        NA                                        0           NA          NA
## 5        NA                                        0           NA          NA
##   open_acc_6m open_il_6m open_il_12m open_il_24m mths_since_rcnt_il
## 1          NA         NA          NA          NA                 NA
## 2          NA         NA          NA          NA                 NA
## 3          NA         NA          NA          NA                 NA
## 4          NA         NA          NA          NA                 NA
## 5          NA         NA          NA          NA                 NA
##   total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util
## 1           NA      NA          NA          NA         NA       NA
## 2           NA      NA          NA          NA         NA       NA
## 3           NA      NA          NA          NA         NA       NA
## 4           NA      NA          NA          NA         NA       NA
## 5           NA      NA          NA          NA         NA       NA
##   total_rev_hi_lim inq_fi total_cu_tl inq_last_12m
## 1               NA     NA          NA           NA
## 2               NA     NA          NA           NA
## 3               NA     NA          NA           NA
## 4               NA     NA          NA           NA
## 5               NA     NA          NA           NA

0.2 Checking the data

dim(loan)
## [1] 887379     74
names(loan)
##  [1] "id"                          "member_id"                  
##  [3] "loan_amnt"                   "funded_amnt"                
##  [5] "funded_amnt_inv"             "term"                       
##  [7] "int_rate"                    "installment"                
##  [9] "grade"                       "sub_grade"                  
## [11] "emp_title"                   "emp_length"                 
## [13] "home_ownership"              "annual_inc"                 
## [15] "verification_status"         "issue_d"                    
## [17] "loan_status"                 "pymnt_plan"                 
## [19] "url"                         "desc"                       
## [21] "purpose"                     "title"                      
## [23] "zip_code"                    "addr_state"                 
## [25] "dti"                         "delinq_2yrs"                
## [27] "earliest_cr_line"            "inq_last_6mths"             
## [29] "mths_since_last_delinq"      "mths_since_last_record"     
## [31] "open_acc"                    "pub_rec"                    
## [33] "revol_bal"                   "revol_util"                 
## [35] "total_acc"                   "initial_list_status"        
## [37] "out_prncp"                   "out_prncp_inv"              
## [39] "total_pymnt"                 "total_pymnt_inv"            
## [41] "total_rec_prncp"             "total_rec_int"              
## [43] "total_rec_late_fee"          "recoveries"                 
## [45] "collection_recovery_fee"     "last_pymnt_d"               
## [47] "last_pymnt_amnt"             "next_pymnt_d"               
## [49] "last_credit_pull_d"          "collections_12_mths_ex_med" 
## [51] "mths_since_last_major_derog" "policy_code"                
## [53] "application_type"            "annual_inc_joint"           
## [55] "dti_joint"                   "verification_status_joint"  
## [57] "acc_now_delinq"              "tot_coll_amt"               
## [59] "tot_cur_bal"                 "open_acc_6m"                
## [61] "open_il_6m"                  "open_il_12m"                
## [63] "open_il_24m"                 "mths_since_rcnt_il"         
## [65] "total_bal_il"                "il_util"                    
## [67] "open_rv_12m"                 "open_rv_24m"                
## [69] "max_bal_bc"                  "all_util"                   
## [71] "total_rev_hi_lim"            "inq_fi"                     
## [73] "total_cu_tl"                 "inq_last_12m"

0.3 Missing Value

NA 결측 Plot 으로 확인

na_lineplot <- gg_miss_var(loan)
na_lineplot

- 10% 이상이 누락된 데이터를 제거, 사용하지 않은 Col 삭제

loan_cleaned <- loan[, -which(colMeans(is.na(loan))>0.1)]
loan_cleaned <-within(loan_cleaned, rm('member_id', 'id', 'url', 'emp_title', "title","zip_code"))

0.4 Converting variables

  • 변수 삭제하기
loan_cleaned <- within(loan_cleaned, rm("policy_code", "pymnt_plan"))

0.5 EDA

Exploratory Data Analysis EDA results are displayed with GGplot visualisation, through which the data interpretation are described.

0.5.1 Issued_date & Loan

The total amount of loan and the mean of loan are shown by year variables.

loan_cleaned$issue_year <- substr(loan_cleaned$issue_d, 5,8)
loan_cleaned$new_issue_d <- parse_date_time(gsub("^", "01-", loan_cleaned$issue_d), orders = c("d-m-y", "d-B-Y", "m/d/y"))

Table_1<-loan_cleaned%>%
  select(new_issue_d, loan_amnt)%>%
  group_by(new_issue_d)%>%
  summarise(amount=sum(loan_amnt))

Table_1 <-na.omit(Table_1)
Table_1
## # A tibble: 9 x 2
##   new_issue_d             amount
##   <dttm>                   <dbl>
## 1 2007-01-20 00:00:00    4977475
## 2 2008-01-20 00:00:00   21119250
## 3 2009-01-20 00:00:00   51928250
## 4 2010-01-20 00:00:00  131992550
## 5 2011-01-20 00:00:00  261683825
## 6 2012-01-20 00:00:00  718411025
## 7 2013-01-20 00:00:00 1981989225
## 8 2014-01-20 00:00:00 3503830175
## 9 2015-01-20 00:00:00 6417570175
Table_2 <-loan_cleaned%>%
  select(new_issue_d, loan_amnt)%>%
  group_by(new_issue_d)%>%
  summarise(m=mean(loan_amnt))

Table_2 <-na.omit(Table_2)
Table_2
## # A tibble: 9 x 2
##   new_issue_d              m
##   <dttm>               <dbl>
## 1 2007-01-20 00:00:00  8255.
## 2 2008-01-20 00:00:00  8825.
## 3 2009-01-20 00:00:00  9833.
## 4 2010-01-20 00:00:00 10528.
## 5 2011-01-20 00:00:00 12048.
## 6 2012-01-20 00:00:00 13462.
## 7 2013-01-20 00:00:00 14708.
## 8 2014-01-20 00:00:00 14870.
## 9 2015-01-20 00:00:00 15240.
Table_3<-merge(Table_1, Table_2, by='new_issue_d', all=FALSE)
kable(Table_3)
new_issue_d amount m
2007-01-20 4977475 8254.519
2008-01-20 21119250 8825.428
2009-01-20 51928250 9833.034
2010-01-20 131992550 10528.240
2011-01-20 261683825 12047.504
2012-01-20 718411025 13461.709
2013-01-20 1981989225 14708.094
2014-01-20 3503830175 14870.177
2015-01-20 6417570175 15240.268
p1<-ggplot(Table_3, aes(x=new_issue_d, y=amount))+
  geom_line(color="indianred3", size=1)+
  scale_y_continuous(labels = scales::comma)+
  geom_point(size=2)+
  labs(y="Amount of Loan", x="Year",
       title = "Amount of Loan by Year")+
  theme_economist()
  

  
p2<-ggplot(Table_3, aes(x=new_issue_d, y=m))+
  geom_line(color="lightgrey", size=1)+
  scale_y_continuous(labels = scales::comma)+
  geom_point(size=2, col="blue")+
  labs(y="Mean of Loan", x="Year",
       title = " Mean of Loan by Year")+
  theme_economist()
  
  
grid.arrange(p1,p2, nrow=2)

  • The mean of loan amount by year and purpose
Mean_purpose <-loan_cleaned%>%
  group_by(new_issue_d, purpose)%>%
  summarise(loan_mean = mean(loan_amnt))

ggplot(Mean_purpose, aes(x=new_issue_d, y= loan_mean, color=purpose))+
  geom_line(size=1.5)+
  theme_economist()

  • The Mean of loan amount by year and grade
Mean_year_grade<-loan_cleaned%>%
  group_by(new_issue_d, grade)%>%
  summarise(loan_mean = mean(loan_amnt))

Mean_year_grade
## # A tibble: 64 x 3
## # Groups:   new_issue_d [10]
##    new_issue_d         grade loan_mean
##    <dttm>              <fct>     <dbl>
##  1 2007-01-20 00:00:00 A         5015.
##  2 2007-01-20 00:00:00 B         8338.
##  3 2007-01-20 00:00:00 C         8276.
##  4 2007-01-20 00:00:00 D         8006.
##  5 2007-01-20 00:00:00 E         8138.
##  6 2007-01-20 00:00:00 F        11851.
##  7 2007-01-20 00:00:00 G        10848.
##  8 2008-01-20 00:00:00 A         6235.
##  9 2008-01-20 00:00:00 B         9175.
## 10 2008-01-20 00:00:00 C         8840.
## # ... with 54 more rows
ggplot(Mean_year_grade, aes(x=new_issue_d, y= loan_mean, color=grade))+
  geom_line(size=1)+
  theme_economist()

  • The Mean of loan amount by Home ownership
home_amount<-loan_cleaned%>%
  group_by(new_issue_d, home_ownership)%>%
  summarise(loan_mean = mean(loan_amnt))

home_amount
## # A tibble: 38 x 3
## # Groups:   new_issue_d [10]
##    new_issue_d         home_ownership loan_mean
##    <dttm>              <fct>              <dbl>
##  1 2007-01-20 00:00:00 MORTGAGE           9982.
##  2 2007-01-20 00:00:00 NONE               6714.
##  3 2007-01-20 00:00:00 OWN                7760.
##  4 2007-01-20 00:00:00 RENT               7309.
##  5 2008-01-20 00:00:00 MORTGAGE          10137.
##  6 2008-01-20 00:00:00 NONE              10000 
##  7 2008-01-20 00:00:00 OTHER              8159.
##  8 2008-01-20 00:00:00 OWN                8034.
##  9 2008-01-20 00:00:00 RENT               7977.
## 10 2009-01-20 00:00:00 MORTGAGE          10761.
## # ... with 28 more rows
ggplot(home_amount, aes(x=new_issue_d, y= loan_mean, 
                        color=home_ownership))+
  geom_line(size=2)+
  labs(x= "Issue Year", y="Amount")+
  theme_economist()

0.5.2 Loan_State & Loan

staus_table<-loan_cleaned%>%
  group_by(loan_status)%>%
  summarise(count=n())%>%
  mutate(Percent= round(count/sum(count)*100,2))

kable(staus_table, format="markdown")
loan_status count Percent
Charged Off 45248 5.10
Current 601778 67.82
Default 1219 0.14
Does not meet the credit policy. Status:Charged Off 761 0.09
Does not meet the credit policy. Status:Fully Paid 1988 0.22
Fully Paid 207723 23.41
In Grace Period 6253 0.70
Issued 8460 0.95
Late (16-30 days) 2357 0.27
Late (31-120 days) 11591 1.31
n 1 0.00
ggplot(staus_table, aes(reorder(loan_status, Percent), y=Percent, fill=loan_status))+
  geom_bar(stat="identity")+
  geom_text(aes(x=loan_status, y=0.05, label=paste0(Percent, "%"),vjust= .05, hjust=0.005))+
  theme_minimal()+
  coord_flip()

Using indicators, loan state are divide into either Good or Bad, adding the binary colunm.

bad_indicators <- c("Charged Off",
                    "Default",
                    "Does not meet the credit policy. Status:Charged Off",
                    "In Grace Period", 
                    "Default Receiver", 
                    "Late (16-30 days)",
                    "Late (31-120 days)")

loan_cleaned$Indicator <- ifelse(loan_cleaned$loan_status %in% bad_indicators, "Bad","Good")
loan_cleaned <- filter(loan_cleaned, loan_cleaned$Indicator == "Bad" |loan_cleaned$Indicator == "Good" )
loan_cleaned <- mutate(loan_cleaned, binary_status=as.factor(ifelse(loan_cleaned$Indicator =="Bad",0, 1)))

loan_cleaned$Indicator<-as.factor(loan_cleaned$Indicator)

summary(loan_cleaned$Indicator)
##    Bad   Good 
##  67429 819950
summary(loan_cleaned$binary_status)
##      0      1 
##  67429 819950
binary_statue<-loan_cleaned%>%
  group_by(binary_status)%>%
  summarise(count=n())%>%
  mutate(Percent= round(count/sum(count)*100,2))

kable(binary_statue)
binary_status count Percent
0 67429 7.6
1 819950 92.4
ggplot(binary_statue, aes(x=binary_status, Percent), y=Percent)+
  geom_bar(stat="identity",fill = "indianred3",color="black")+
  geom_text(aes(x= binary_status, y= Percent, 
            label=paste0(Percent, "%")),
            hjust=0.5, vjust=-0.25,size = 5, colour = 'black')+
  labs(x="Binary Loan State", y="Percent", "Good or Bad loan status")+
  scale_x_discrete(breaks= c("0","1"), labels=c("Bad", "Good") )+
  theme_bw()

0.5.3 Grade & Loan

table_1<-loan_cleaned%>%
  group_by(grade)%>%
  summarise(count=n())%>%
  mutate(Percent = round(count/sum(count)*100,2))%>%
  arrange(desc(Percent))

kable(table_1, format="markdown") 
grade count Percent
B 254535 28.68
C 245860 27.71
A 148202 16.70
D 139542 15.73
E 70705 7.97
F 23046 2.60
G 5489 0.62
ggplot(table_1, aes(x=reorder(grade, Percent), Percent, fill=grade))+
  geom_bar(stat = "identity", color="black")+
  geom_text(aes(x=grade, y=Percent, label=paste0(Percent, "%"),
                vjust=0.5, hjust=0.5))+
  coord_flip()+
  labs(x="Percent", y="Type of Grade", title= "Grade Rate of Loan")+
  theme_bw()

p3<-ggplot(loan_cleaned, aes(x=grade, fill=Indicator))+
  geom_bar(stat = "count", position = "fill", col="black")+
  scale_fill_brewer(palette = "Set2")+
  theme_bw()

Is grade related to a higher rate?

table_2 <-loan_cleaned%>% 
  group_by(grade)%>% 
  summarise(mean=round(mean(int_rate),2))

kable(table_2)
grade mean
A 7.24
B 10.83
C 13.98
D 17.18
E 19.90
F 23.58
G 25.63
p4<-ggplot(table_2, aes(x=reorder(grade), y=mean, fill=grade))+
  geom_bar(stat = "identity", color="black")+
  geom_text(aes(x=grade, y=mean, label=paste0(mean),vjust=1.3))+
  theme_bw()+
  labs(x="Credit Grade", y="Mean of interest rate")

grid.arrange(p3, p4, nrow=2)

신용 등급이 낮아질수록 채무 불이행의 수가 많아진다. 가장 대출을 많이 받는 B, C 는 평균 11 ~ 14 정도의 이율을 가진다.등급별 이자율을 확인하였을 때, 이자율이 20 이상인 경우, 채무 불이행 증가하는 것으로 나타난다.즉, 이율이 20 정도로 나타나는 등급의 고객은 채무 불이행의 가능성이 다른 등급보다 높다고 판단할 수 있다.

Term and Grade

대출 등급에 따른 대출 기간을 살펴보았을 때, B,C 등급 (평균 이자율 11~14)인 경우, 36 개월 대출기간 선호도가 높다.

plotdata_1 <-loan_cleaned%>%
  group_by(grade, term)%>%
  summarise(n=n())%>%
  mutate(pct= n/sum(n),
         lbl=scales::percent(pct))
  
kable(plotdata_1)
grade term n pct lbl
A 36 months 143015 0.9650005 96.5%
A 60 months 5187 0.0349995 3.5%
B 36 months 212009 0.8329267 83.3%
B 60 months 42526 0.1670733 16.7%
C 36 months 162420 0.6606199 66.1%
C 60 months 83440 0.3393801 33.9%
D 36 months 76687 0.5495621 54.96%
D 60 months 62855 0.4504379 45.04%
E 36 months 21842 0.3089173 30.9%
E 60 months 48863 0.6910827 69.1%
F 36 months 4466 0.1937863 19.4%
F 60 months 18580 0.8062137 80.6%
G 36 months 686 0.1249772 12.5%
G 60 months 4803 0.8750228 87.5%
ggplot(plotdata_1, aes(x=grade, y=pct, fill=term))+
  geom_bar(stat = "identity", position = "fill",col="black")+
  geom_text(aes(label=lbl),
            size=4, position= position_stack(vjust=0.5))+
  scale_fill_brewer(palette = "Set2")+
  theme_bw()+
  labs(x= "Grade", y="Percent",title="Grade and Term")

0.5.4 Employment Length & Loan

10 년 이상 재직한 사람들이 대출을 많이 받는다.

emp_table<-loan_cleaned%>%
  count(emp_length)%>%
  mutate(pct=n/sum(n),
         pctlabel=paste0(round(pct * 100), "%"))

kable(emp_table)
emp_length n pct pctlabel
< 1 year 70605 0.0795658 8%
1 year 57095 0.0643412 6%
10+ years 291569 0.3285732 33%
2 years 78870 0.0888797 9%
3 years 70025 0.0789122 8%
4 years 52529 0.0591957 6%
5 years 55704 0.0627736 6%
6 years 42950 0.0484010 5%
7 years 44594 0.0502536 5%
8 years 43955 0.0495335 5%
9 years 34657 0.0390555 4%
n/a 44825 0.0505139 5%
RENT 1 0.0000011 0%
ggplot(emp_table, aes(x=reorder(emp_length, n), y=n))+
         geom_bar(stat = "identity", fill="cornflowerblue", col="black")+
  geom_text(aes(label=pctlabel), vjust=.5, hjust=1.5, size=4)+
  coord_flip()+
  theme_bw()

  • Why? 10년 이상 재직한 사람들의 대출금과 대출 이자는 어느 정도일까?
year_10<-loan_cleaned%>%
  group_by(emp_length)%>%
  summarise(mean_loan = round(mean(loan_amnt),2),
            mean_int = round(mean(int_rate),2))

kable(year_10)
emp_length mean_loan mean_int
< 1 year 13798.11 13.13
1 year 13737.27 13.19
10+ years 16103.16 13.26
2 years 14015.69 13.19
3 years 14144.55 13.17
4 years 14263.24 13.21
5 years 14340.26 13.26
6 years 14555.09 13.40
7 years 14770.68 13.45
8 years 14971.16 13.26
9 years 15178.66 13.29
n/a 11777.18 13.28
RENT 10000.00 13.18
p5<-ggplot(year_10, aes(x=reorder(emp_length,mean_loan), y=mean_loan))+
  geom_bar(stat = "identity", fill="indianred3", alpha= 0.8, color= "black")+
  geom_text(aes(label=mean_loan), size=3, hjust=1.5)+
  theme_bw()+
  coord_flip()+
  labs(x="Amount of Loan", y="Period of employment", title= "The mean of loan by a lenght of employment")
  
p6<-ggplot(year_10, aes(x=reorder(emp_length,mean_int), y=mean_int))+
  geom_bar(stat = "identity", fill="blue", alpha= 0.4, color= "black")+
  geom_text(aes(label=paste0("$",mean_int),  hjust=1.5))+
  theme_bw()+
  coord_flip()+
  labs(x="Mena of intrest", y="Period of employment", title= "The amount of loan by a lenght of employment")
  
grid.arrange(p5, p6, nrow=2)

Default with respect to employee length - 재직 기간에 따른 채무 불이행 비율

ggplot(filter(loan_cleaned, emp_length !='n/a'), 
       aes(x=emp_length, fill=Indicator))+
  geom_bar(stat = "count", position = "stack", col="black")+
  labs(x= "Emplyment Length", y="Percent")+
  scale_fill_brewer(palette = "Set1")+
  theme_bw()

0.5.5 Home ownership & Loan

  • Mortage 소유자가 전체 대출의 50 % 를 차지하며, Rent 와 Own 가 나머지 50%를 차지한다.
Home_table<-loan_cleaned%>%
  count(home_ownership)%>%
  mutate(pct= n/sum(n),
         pctlabel= paste0(round(pct *100),"%"))%>%
  arrange(desc(n))

kable(Home_table)
home_ownership n pct pctlabel
MORTGAGE 443557 0.4998507 50%
RENT 356116 0.4013122 40%
OWN 87470 0.0985712 10%
OTHER 182 0.0002051 0%
NONE 50 0.0000563 0%
ANY 3 0.0000034 0%
65000.0 1 0.0000011 0%
ggplot(Home_table, aes(x=reorder(home_ownership,pct ), y=pct))+
  geom_bar(stat = "identity", fill="indianred3", alpha=0.8, color="black")+
  geom_text(aes(label=pctlabel), vjust=0.5, hjust=-0.5)+
  coord_flip()+
  theme_bw()+
  labs(x="Type of Home Ownership", y="")

  • Loan Status Rate by Home_Ownership
Home_status<-loan_cleaned%>%
  group_by(home_ownership, loan_status)%>%
  summarise(n=n())%>%
  mutate(pct= n/sum(n),
         lbl=scales::percent(pct))

kable(Home_status)
home_ownership loan_status n pct lbl
65000.0 n 1 1.0000000 100%
ANY Current 2 0.6666667 66.7%
ANY Fully Paid 1 0.3333333 33.3%
MORTGAGE Charged Off 19878 0.0448150 4.5%
MORTGAGE Current 303764 0.6848364 68.5%
MORTGAGE Default 498 0.0011227 0.1%
MORTGAGE Does not meet the credit policy. Status:Charged Off 348 0.0007846 0.1%
MORTGAGE Does not meet the credit policy. Status:Fully Paid 908 0.0020471 0.2%
MORTGAGE Fully Paid 104966 0.2366460 23.7%
MORTGAGE In Grace Period 2855 0.0064366 0.6%
MORTGAGE Issued 4220 0.0095140 1.0%
MORTGAGE Late (16-30 days) 1101 0.0024822 0.2%
MORTGAGE Late (31-120 days) 5019 0.0113153 1.1%
NONE Charged Off 7 0.1400000 14.0%
NONE Current 2 0.0400000 4.0%
NONE Does not meet the credit policy. Status:Charged Off 1 0.0200000 2.0%
NONE Does not meet the credit policy. Status:Fully Paid 4 0.0800000 8.0%
NONE Fully Paid 36 0.7200000 72.0%
OTHER Charged Off 27 0.1483516 14.8%
OTHER Current 3 0.0164835 1.6%
OTHER Does not meet the credit policy. Status:Charged Off 11 0.0604396 6.0%
OTHER Does not meet the credit policy. Status:Fully Paid 27 0.1483516 14.8%
OTHER Fully Paid 114 0.6263736 62.6%
OWN Charged Off 4025 0.0460158 4.6%
OWN Current 62041 0.7092832 70.9%
OWN Default 110 0.0012576 0.1%
OWN Does not meet the credit policy. Status:Charged Off 49 0.0005602 0.1%
OWN Does not meet the credit policy. Status:Fully Paid 138 0.0015777 0.2%
OWN Fully Paid 17960 0.2053275 20.5%
OWN In Grace Period 637 0.0072825 0.7%
OWN Issued 1038 0.0118669 1.2%
OWN Late (16-30 days) 260 0.0029724 0.3%
OWN Late (31-120 days) 1212 0.0138562 1.4%
RENT Charged Off 21311 0.0598429 6.0%
RENT Current 235966 0.6626099 66.3%
RENT Default 611 0.0017157 0.2%
RENT Does not meet the credit policy. Status:Charged Off 352 0.0009884 0.1%
RENT Does not meet the credit policy. Status:Fully Paid 911 0.0025582 0.3%
RENT Fully Paid 84646 0.2376922 23.8%
RENT In Grace Period 2761 0.0077531 0.8%
RENT Issued 3202 0.0089915 0.9%
RENT Late (16-30 days) 996 0.0027968 0.3%
RENT Late (31-120 days) 5360 0.0150513 1.5%
ggplot(Home_status, aes(x= home_ownership, y=pct, fill=loan_status))+
  geom_bar(stat = "identity", position = "stack", col="black")+
  geom_text(aes(label=lbl), position = position_stack(vjust=0.5), size=2.0)+
  theme_bw()+
  labs(x="Home Ownership", y="Percent")

  • Default Rate by Home_ownership
home_rate<-loan_cleaned%>%
  group_by(home_ownership)%>%
  count(Indicator)%>%
  mutate(pct= n/sum(n),
         lbl=scales:: percent(pct))

kable(home_rate)
home_ownership Indicator n pct lbl
65000.0 Good 1 1.0000000 100%
ANY Good 3 1.0000000 100%
MORTGAGE Bad 29699 0.0669564 6.7%
MORTGAGE Good 413858 0.9330436 93.3%
NONE Bad 8 0.1600000 16.0%
NONE Good 42 0.8400000 84.0%
OTHER Bad 38 0.2087912 20.9%
OTHER Good 144 0.7912088 79.1%
OWN Bad 6293 0.0719447 7.2%
OWN Good 81177 0.9280553 92.8%
RENT Bad 31391 0.0881482 8.8%
RENT Good 324725 0.9118518 91.2%
ggplot(home_rate, aes(x=home_ownership, y=n, fill=Indicator))+
  geom_bar(stat = "identity", position = "fill", col="black")+
  geom_text(aes(label=lbl), position=position_fill(vjust=0.5))+
  labs(x="Home Ownership", y="Percent")+
  theme_bw()

0.5.6 Vertification & Loan

  • Vertifaction Rate of Loan
V_Loan<-loan_cleaned%>%
  count(verification_status)%>%
  mutate(pct= n/sum(n),
         lbl= scales:: percent(pct))%>%
  arrange(desc(pct))


ggplot(V_Loan, aes(x=reorder(verification_status, pct), y=pct))+
  geom_bar(stat = "identity", fill="Indianred3", alpha=0.7, col="black")+
  geom_text(aes(label=lbl), vjust=-0.5)+
  coord_flip()+
  theme_bw()

  • Default rate by verification
ggplot(filter(loan_cleaned, verification_status !='Nov-2015'), aes(x=verification_status, fill=Indicator))+
  geom_bar(stat = "count", col="Black", position = "fill")+
  theme_bw()+
  labs(x="Verification Status", y="Percent")

  • The Mean of Loan amount by vertification and grade
Vergrade_loan<-loan_cleaned%>%
  group_by(verification_status, grade)%>%
  summarise(n=mean(loan_amnt))

kable(Vergrade_loan)
verification_status grade n
Not Verified A 11423.35
Not Verified B 11118.95
Not Verified C 10994.67
Not Verified D 10871.37
Not Verified E 12133.26
Not Verified F 11454.21
Not Verified G 14678.71
Nov-2015 C 10000.00
Source Verified A 15649.29
Source Verified B 14494.75
Source Verified C 14930.06
Source Verified D 16145.12
Source Verified E 17906.83
Source Verified F 18862.78
Source Verified G 19566.95
Verified A 16812.02
Verified B 15848.95
Verified C 16689.91
Verified D 17335.82
Verified E 19718.09
Verified F 20509.23
Verified G 21693.93
ggplot(filter(Vergrade_loan, verification_status !='Nov-2015'), aes(x=grade, y=n))+
  geom_histogram(stat = "identity", fill= "cornflowerblue")+
  facet_wrap(~verification_status)+
  labs(x="Grade", y=" Mean of loan")+
  theme_bw()

*Defualt rate by verfiation and grade

vergrade_indi<-loan_cleaned%>%
  group_by(verification_status, grade)%>%
  count(Indicator)

kable(vergrade_indi)
verification_status grade Indicator n
Not Verified A Bad 1561
Not Verified A Good 63501
Not Verified B Bad 5104
Not Verified B Good 87359
Not Verified C Bad 5326
Not Verified C Good 61715
Not Verified D Bad 3628
Not Verified D Good 26956
Not Verified E Bad 1259
Not Verified E Good 7917
Not Verified F Bad 397
Not Verified F Good 1635
Not Verified G Bad 119
Not Verified G Good 272
Nov-2015 C Good 1
Source Verified A Bad 1113
Source Verified A Good 50825
Source Verified B Bad 3935
Source Verified B Good 87743
Source Verified C Bad 6360
Source Verified C Good 87326
Source Verified D Bad 5647
Source Verified D Good 48200
Source Verified E Bad 3656
Source Verified E Good 25059
Source Verified F Bad 1444
Source Verified F Good 6558
Source Verified G Bad 383
Source Verified G Good 1309
Verified A Bad 989
Verified A Good 30213
Verified B Bad 4417
Verified B Good 65977
Verified C Bad 7368
Verified C Good 77764
Verified D Bad 6584
Verified D Good 48527
Verified E Bad 4830
Verified E Good 27984
Verified F Bad 2542
Verified F Good 10470
Verified G Bad 767
Verified G Good 2639
ggplot(filter(vergrade_indi, verification_status !='Nov-2015'),
        aes(x=verification_status, y= n, fill=Indicator))+
  geom_bar(stat = "identity",position = "stack")+
  facet_grid(.~grade)+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 90,hjust = 1))

0.5.7 Purpose & Loan

p_table<-loan_cleaned%>%
  count(purpose)%>%
  mutate(pct=n/sum(n),
         lbl=scales::percent(pct))%>%
  arrange(desc(n))

kable(p_table)
purpose n pct lbl
debt_consolidation 524214 0.5907442 59.1%
credit_card 206181 0.2323483 23.2%
home_improvement 51829 0.0584068 5.8%
other 42894 0.0483379 4.8%
major_purchase 17277 0.0194697 1.9%
small_business 10377 0.0116940 1.2%
car 8863 0.0099878 1.0%
medical 8540 0.0096238 1.0%
moving 5414 0.0061011 0.6%
vacation 4736 0.0053371 0.5%
house 3707 0.0041775 0.4%
wedding 2347 0.0026449 0.3%
renewable_energy 575 0.0006480 0.1%
educational 423 0.0004767 0.0%
Debt consolidation 1 0.0000011 0.0%
Paying off my son’s dept… 1 0.0000011 0.0%
ggplot(filter(p_table, lbl >= 0.01),aes(x=reorder(purpose,pct), y=pct))+
  geom_bar(stat = "identity", fill="blue", alpha=0.7, col="white")+
  geom_text(aes(label=lbl), vjust= 0.5, hjust= -0.5)+
  coord_flip()+
  theme_bw()+
  labs(y="Percent", x="Purpose for loan")

  • The loan amount by purpose
ggplot(loan_cleaned, 
       aes(x=loan_amnt, y=purpose, fill=purpose))+
  geom_density_ridges()+
  theme_ridges()+
  theme(legend.position = "none")

  • Default rate by purpose of Loan
Dp_table <-loan_cleaned%>%
  group_by(purpose)%>%
  count(Indicator)%>%
  mutate(pct = n/sum(n),
         lbl= scales::percent(pct))%>%
  arrange(desc(n))

kable(Dp_table, format = "markdown")
purpose Indicator n pct lbl
debt_consolidation Good 482606 0.9206278 92.1%
credit_card Good 194426 0.9429870 94.3%
home_improvement Good 48276 0.9314476 93.1%
debt_consolidation Bad 41608 0.0793722 7.9%
other Good 38731 0.9029468 90.3%
major_purchase Good 15983 0.9251027 92.5%
credit_card Bad 11755 0.0570130 5.7%
small_business Good 8596 0.8283704 82.8%
car Good 8267 0.9327541 93.3%
medical Good 7736 0.9058548 90.6%
moving Good 4807 0.8878833 88.8%
vacation Good 4334 0.9151182 91.5%
other Bad 4163 0.0970532 9.7%
home_improvement Bad 3553 0.0685524 6.9%
house Good 3290 0.8875101 88.8%
wedding Good 2058 0.8768641 87.7%
small_business Bad 1781 0.1716296 17.2%
major_purchase Bad 1294 0.0748973 7.5%
medical Bad 804 0.0941452 9.4%
moving Bad 607 0.1121167 11.2%
car Bad 596 0.0672459 6.7%
renewable_energy Good 503 0.8747826 87.5%
house Bad 417 0.1124899 11.2%
vacation Bad 402 0.0848818 8.5%
educational Good 335 0.7919622 79.2%
wedding Bad 289 0.1231359 12.3%
educational Bad 88 0.2080378 20.8%
renewable_energy Bad 72 0.1252174 12.5%
Debt consolidation Good 1 1.0000000 100%
Paying off my son’s dept… Good 1 1.0000000 100%
ggplot(Dp_table, aes(x=reorder(purpose,pct) ,y=pct, fill=Indicator))+
  geom_bar(stat = "identity", col="black")+
  theme(axis.text.x = element_text(angle = 90,hjust = 1))+
  geom_text(aes(label=lbl), size= 3, position = position_stack(vjust=0.5))

  • Education and Small business
Edu_small <- subset(loan_cleaned, loan_cleaned$purpose== "educational" | loan_cleaned$purpose== "small_business")

funtable<-Edu_small%>%
  group_by(application_type, grade, term, Indicator)%>%
  count()


ggplot(funtable,aes(axis1 = application_type ,
                      axis2 = grade,
                        axis3 = Indicator ,
                          y = n)) +
  geom_alluvium(aes(fill = term)) +
  geom_stratum() +
  geom_text(stat = "stratum",label.strata = TRUE) 

  • Defualt rate by region and grade regarding customer with educational and small business purpose
a<-Edu_small%>%
  group_by(addr_state, grade, Indicator)%>%
  count(Indicator)%>%
  arrange(desc(n))

summary(a$Indicator)
##  Bad Good 
##  260  316
ggplot(a, aes(x=grade, fill=Indicator))+
  geom_bar(position = "stack")+
  facet_wrap(~addr_state)

0.5.8 State and Loan

  • Visualisation of state (Cateogrial variable) with Tree map
plotdata <- loan_cleaned%>%
  count(addr_state)

ggplot(plotdata, aes(fill=addr_state, 
                     area=n,
                     label=addr_state))+
  geom_treemap(col="black")+
  geom_treemap_text(color="white", place="center")+
  theme(legend.position = "none")

  • Loan Amount by States
ggplot(filter(loan_cleaned, addr_state != '22.32' & addr_state != '16.33'),aes(x=addr_state, y=loan_amnt, fill=addr_state))+
  stat_summary(fun.y="sum", geom="bar")+
  coord_flip()

  • Heat Map of loan amount in all state
loan_cleaned$region <- loan_cleaned$addr_state
loan_cleaned$region <- as.factor(loan_cleaned$region)

levels(loan_cleaned$region)<- c("1","2", "alaska", "alabama","arkansas", "arizona", "california","colorado","connecticut","district of columbia","delaware","florida","georgia","hawaii","iowa","idaho","illinois","indiana","kansas","kentucky","louisiana","massachusetts","maryland","maine","michigan","minnesota","missouri","mississippi","montana","north carolina","north dakota","nebraska","new hampshire","new jersey","new mexico","nevada","new york","ohio","oklahoma","oregon","pennsylvania","rhode island","south carolina","south dakota","tennessee","texas","utah","virginia","vermont","washington","wisconsin","west virginia","wyoming")


all_states <- map_data("state")

state_by_loan<-loan_cleaned%>%
  group_by(region)%>%
  summarise(value=sum(loan_amnt), na.rm=TRUE)

state_by_loan$region <- as.character(state_by_loan$region)
Total <- merge(all_states, state_by_loan, by="region")
head(Total,3)
##    region      long      lat group order subregion     value na.rm
## 1 alabama -87.46201 30.38968     1     1      <NA> 164627650  TRUE
## 2 alabama -87.48493 30.37249     1     2      <NA> 164627650  TRUE
## 3 alabama -87.52503 30.37249     1     3      <NA> 164627650  TRUE
p <- ggplot()
p <- p + geom_polygon(data=Total, aes(x=long, y=lat, group = group, fill=Total$value),colour="white"
      ) + scale_fill_continuous(low = "skyblue", high = "darkblue", guide="colorbar")
P1 <- p + theme_bw()  + labs(fill = "Gradient of loan amount" 
                            ,title = "Loan amount in all states", x="", y="")
P1 + scale_y_continuous(breaks=c()) + scale_x_continuous(breaks=c()) + theme(panel.border =  element_blank())

  • Loan count in each state
a=data.table(table(loan_cleaned$addr_state))
setnames(a,c("region","count"))



a$region=sapply(state.name[match(a$region, state.abb)],tolower)
all_states <- map_data("state")
Total <- merge(all_states, a, by="region")

head(Total,3)
##    region      long      lat group order subregion count
## 1 alabama -87.46201 30.38968     1     1      <NA> 11200
## 2 alabama -87.48493 30.37249     1     2      <NA> 11200
## 3 alabama -87.95475 30.24644     1    13      <NA> 11200
ggplot(Total, aes(x=long, y=lat, map_id = region)) + 
  geom_map(aes(fill= count), map = all_states)+
  labs(title="Loan counts in each state",x="",y="")+
  scale_fill_gradientn("",colours=terrain.colors(8),guide = "legend")+
  theme_bw()

0.5.9 Loan Amount & Loan default by Terms

ggplot(filter(loan_cleaned, verification_status !='Nov-2015'), 
       aes(x=loan_amnt, fill=Indicator))+
  geom_density()+
  facet_grid(verification_status ~ term)

0.5.10 Income & Loan default

  • Which state has the highest rate income?
all_states <- map_data("state")


Income_state<-loan_cleaned%>%
  group_by(region)%>%
  summarise(avg_income = mean(annual_inc))

t <- merge(all_states, Income_state, by="region")

head(t,3)
##    region      long      lat group order subregion avg_income
## 1 alabama -87.46201 30.38968     1     1      <NA>         NA
## 2 alabama -87.48493 30.37249     1     2      <NA>         NA
## 3 alabama -87.52503 30.37249     1     3      <NA>         NA
p3 <- ggplot()+
  geom_polygon(data=t,aes(x=long, y=lat, group=group, fill=t$avg_income),colour="white") +
   scale_fill_continuous(low = "skyblue", high = "darkblue", guide="colorbar")+
  ggtitle("Average income by State")

p3

  • Distribution of annaul income
Desc(loan_cleaned$annual_inc)
## ------------------------------------------------------------------------------ 
## loan_cleaned$annual_inc (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   887'379 887'379       0  49'386  49'386       y
##            100.0%    0.0%                        
## 
##        level    freq  perc  cumfreq  cumperc
## 1    60000.0  34'281  3.9%   34'281     3.9%
## 2    50000.0  30'575  3.4%   64'856     7.3%
## 3    65000.0  25'497  2.9%   90'353    10.2%
## 4    70000.0  24'121  2.7%  114'474    12.9%
## 5    40000.0  23'943  2.7%  138'417    15.6%
## 6    80000.0  22'729  2.6%  161'146    18.2%
## 7    45000.0  22'699  2.6%  183'845    20.7%
## 8    75000.0  22'435  2.5%  206'280    23.2%
## 9    55000.0  20'755  2.3%  227'035    25.6%
## 10   90000.0  17'159  1.9%  244'194    27.5%
## 11  100000.0  17'131  1.9%  261'325    29.4%
## 12   85000.0  15'648  1.8%  276'973    31.2%
## ... etc.
##  [list output truncated]

loan_cleaned$annual_inc <-as.numeric(loan_cleaned$annual_in)
loan_cleaned <- mutate(loan_cleaned, inc_grp = annual_inc)


loan_cleaned$inc_grp[loan_cleaned$annual_inc <= 5000] <- "verylow_inc"
loan_cleaned$inc_grp[loan_cleaned$annual_inc > 5000 & loan_cleaned$annual_inc <= 8000] <- 'low_inc'
loan_cleaned$inc_grp[loan_cleaned$annual_inc > 8000 & loan_cleaned$annual_inc <= 12000] <- 'middle_inc'
loan_cleaned$inc_grp[loan_cleaned$annual_inc > 12000 & loan_cleaned$annual_inc <= 24000] <- 'high_inc'
loan_cleaned$inc_grp[loan_cleaned$annual_inc > 24000] <- 'veryhigh_inc'



loan_cleaned$inc_grp <-as.factor(loan_cleaned$inc_grp)
summary(loan_cleaned$inc_grp)
##     high_inc      low_inc   middle_inc veryhigh_inc  verylow_inc 
##       205410        52085        49791       481830        98263

소득이 높은 그룹이 대출을 자주 하는것으로 나타난다.

pt1<-loan_cleaned%>%
  group_by(inc_grp)%>%
  summarise(count=n())%>%
  mutate(pct = count/sum(count),
         lbl= scales:: percent(pct))
kable(pt1)
inc_grp count pct lbl
high_inc 205410 0.2314794 23.1%
low_inc 52085 0.0586953 5.9%
middle_inc 49791 0.0561102 5.6%
veryhigh_inc 481830 0.5429811 54.3%
verylow_inc 98263 0.1107340 11.1%
ggplot(pt1, aes(x=reorder(inc_grp, pct), y=pct))+
  geom_bar(stat = "identity", fill="indianred3", alpha= 0.9,  color="black")+
  geom_text(aes(label=lbl), vjust=-0.25, hjust=0.5)+
  coord_flip()+
  theme_bw()+
  labs(x="Gruop of Income", y="Percent")

  • Default rate by income groups 소득이 높은 그룹이 채무 불 이행률이 가장 높은 것으로 나타났다. 다른 수입군에 비해 이자율의 차이는 없어보이며, 소득이 많은 그룹의 채무 불이행률에 대해서 이자율과는 낮은 상관이 있는것 같아 보인다.
ggplot(loan_cleaned, aes(x=reorder(inc_grp, annual_inc), fill=Indicator))+
             geom_bar(position = "fill")+
  theme_bw()

ggplot(loan_cleaned, aes(y= inc_grp, x=int_rate, fill=inc_grp))+
  geom_density_ridges()+
  theme_ridges()