EDA
Exploratory Data Analysis EDA results are displayed with GGplot visualisation, through which the data interpretation are described.
Issued_date & Loan
The total amount of loan and the mean of loan are shown by year variables.
loan_cleaned$issue_year <- substr(loan_cleaned$issue_d, 5,8)
loan_cleaned$new_issue_d <- parse_date_time(gsub("^", "01-", loan_cleaned$issue_d), orders = c("d-m-y", "d-B-Y", "m/d/y"))
Table_1<-loan_cleaned%>%
select(new_issue_d, loan_amnt)%>%
group_by(new_issue_d)%>%
summarise(amount=sum(loan_amnt))
Table_1 <-na.omit(Table_1)
Table_1
## # A tibble: 9 x 2
## new_issue_d amount
## <dttm> <dbl>
## 1 2007-01-20 00:00:00 4977475
## 2 2008-01-20 00:00:00 21119250
## 3 2009-01-20 00:00:00 51928250
## 4 2010-01-20 00:00:00 131992550
## 5 2011-01-20 00:00:00 261683825
## 6 2012-01-20 00:00:00 718411025
## 7 2013-01-20 00:00:00 1981989225
## 8 2014-01-20 00:00:00 3503830175
## 9 2015-01-20 00:00:00 6417570175
Table_2 <-loan_cleaned%>%
select(new_issue_d, loan_amnt)%>%
group_by(new_issue_d)%>%
summarise(m=mean(loan_amnt))
Table_2 <-na.omit(Table_2)
Table_2
## # A tibble: 9 x 2
## new_issue_d m
## <dttm> <dbl>
## 1 2007-01-20 00:00:00 8255.
## 2 2008-01-20 00:00:00 8825.
## 3 2009-01-20 00:00:00 9833.
## 4 2010-01-20 00:00:00 10528.
## 5 2011-01-20 00:00:00 12048.
## 6 2012-01-20 00:00:00 13462.
## 7 2013-01-20 00:00:00 14708.
## 8 2014-01-20 00:00:00 14870.
## 9 2015-01-20 00:00:00 15240.
Table_3<-merge(Table_1, Table_2, by='new_issue_d', all=FALSE)
kable(Table_3)
| 2007-01-20 |
4977475 |
8254.519 |
| 2008-01-20 |
21119250 |
8825.428 |
| 2009-01-20 |
51928250 |
9833.034 |
| 2010-01-20 |
131992550 |
10528.240 |
| 2011-01-20 |
261683825 |
12047.504 |
| 2012-01-20 |
718411025 |
13461.709 |
| 2013-01-20 |
1981989225 |
14708.094 |
| 2014-01-20 |
3503830175 |
14870.177 |
| 2015-01-20 |
6417570175 |
15240.268 |
p1<-ggplot(Table_3, aes(x=new_issue_d, y=amount))+
geom_line(color="indianred3", size=1)+
scale_y_continuous(labels = scales::comma)+
geom_point(size=2)+
labs(y="Amount of Loan", x="Year",
title = "Amount of Loan by Year")+
theme_economist()
p2<-ggplot(Table_3, aes(x=new_issue_d, y=m))+
geom_line(color="lightgrey", size=1)+
scale_y_continuous(labels = scales::comma)+
geom_point(size=2, col="blue")+
labs(y="Mean of Loan", x="Year",
title = " Mean of Loan by Year")+
theme_economist()
grid.arrange(p1,p2, nrow=2)

- The mean of loan amount by year and purpose
Mean_purpose <-loan_cleaned%>%
group_by(new_issue_d, purpose)%>%
summarise(loan_mean = mean(loan_amnt))
ggplot(Mean_purpose, aes(x=new_issue_d, y= loan_mean, color=purpose))+
geom_line(size=1.5)+
theme_economist()

- The Mean of loan amount by year and grade
Mean_year_grade<-loan_cleaned%>%
group_by(new_issue_d, grade)%>%
summarise(loan_mean = mean(loan_amnt))
Mean_year_grade
## # A tibble: 64 x 3
## # Groups: new_issue_d [10]
## new_issue_d grade loan_mean
## <dttm> <fct> <dbl>
## 1 2007-01-20 00:00:00 A 5015.
## 2 2007-01-20 00:00:00 B 8338.
## 3 2007-01-20 00:00:00 C 8276.
## 4 2007-01-20 00:00:00 D 8006.
## 5 2007-01-20 00:00:00 E 8138.
## 6 2007-01-20 00:00:00 F 11851.
## 7 2007-01-20 00:00:00 G 10848.
## 8 2008-01-20 00:00:00 A 6235.
## 9 2008-01-20 00:00:00 B 9175.
## 10 2008-01-20 00:00:00 C 8840.
## # ... with 54 more rows
ggplot(Mean_year_grade, aes(x=new_issue_d, y= loan_mean, color=grade))+
geom_line(size=1)+
theme_economist()

- The Mean of loan amount by Home ownership
home_amount<-loan_cleaned%>%
group_by(new_issue_d, home_ownership)%>%
summarise(loan_mean = mean(loan_amnt))
home_amount
## # A tibble: 38 x 3
## # Groups: new_issue_d [10]
## new_issue_d home_ownership loan_mean
## <dttm> <fct> <dbl>
## 1 2007-01-20 00:00:00 MORTGAGE 9982.
## 2 2007-01-20 00:00:00 NONE 6714.
## 3 2007-01-20 00:00:00 OWN 7760.
## 4 2007-01-20 00:00:00 RENT 7309.
## 5 2008-01-20 00:00:00 MORTGAGE 10137.
## 6 2008-01-20 00:00:00 NONE 10000
## 7 2008-01-20 00:00:00 OTHER 8159.
## 8 2008-01-20 00:00:00 OWN 8034.
## 9 2008-01-20 00:00:00 RENT 7977.
## 10 2009-01-20 00:00:00 MORTGAGE 10761.
## # ... with 28 more rows
ggplot(home_amount, aes(x=new_issue_d, y= loan_mean,
color=home_ownership))+
geom_line(size=2)+
labs(x= "Issue Year", y="Amount")+
theme_economist()

Loan_State & Loan
staus_table<-loan_cleaned%>%
group_by(loan_status)%>%
summarise(count=n())%>%
mutate(Percent= round(count/sum(count)*100,2))
kable(staus_table, format="markdown")
| Charged Off |
45248 |
5.10 |
| Current |
601778 |
67.82 |
| Default |
1219 |
0.14 |
| Does not meet the credit policy. Status:Charged Off |
761 |
0.09 |
| Does not meet the credit policy. Status:Fully Paid |
1988 |
0.22 |
| Fully Paid |
207723 |
23.41 |
| In Grace Period |
6253 |
0.70 |
| Issued |
8460 |
0.95 |
| Late (16-30 days) |
2357 |
0.27 |
| Late (31-120 days) |
11591 |
1.31 |
| n |
1 |
0.00 |
ggplot(staus_table, aes(reorder(loan_status, Percent), y=Percent, fill=loan_status))+
geom_bar(stat="identity")+
geom_text(aes(x=loan_status, y=0.05, label=paste0(Percent, "%"),vjust= .05, hjust=0.005))+
theme_minimal()+
coord_flip()

Using indicators, loan state are divide into either Good or Bad, adding the binary colunm.
bad_indicators <- c("Charged Off",
"Default",
"Does not meet the credit policy. Status:Charged Off",
"In Grace Period",
"Default Receiver",
"Late (16-30 days)",
"Late (31-120 days)")
loan_cleaned$Indicator <- ifelse(loan_cleaned$loan_status %in% bad_indicators, "Bad","Good")
loan_cleaned <- filter(loan_cleaned, loan_cleaned$Indicator == "Bad" |loan_cleaned$Indicator == "Good" )
loan_cleaned <- mutate(loan_cleaned, binary_status=as.factor(ifelse(loan_cleaned$Indicator =="Bad",0, 1)))
loan_cleaned$Indicator<-as.factor(loan_cleaned$Indicator)
summary(loan_cleaned$Indicator)
## Bad Good
## 67429 819950
summary(loan_cleaned$binary_status)
## 0 1
## 67429 819950
binary_statue<-loan_cleaned%>%
group_by(binary_status)%>%
summarise(count=n())%>%
mutate(Percent= round(count/sum(count)*100,2))
kable(binary_statue)
| 0 |
67429 |
7.6 |
| 1 |
819950 |
92.4 |
ggplot(binary_statue, aes(x=binary_status, Percent), y=Percent)+
geom_bar(stat="identity",fill = "indianred3",color="black")+
geom_text(aes(x= binary_status, y= Percent,
label=paste0(Percent, "%")),
hjust=0.5, vjust=-0.25,size = 5, colour = 'black')+
labs(x="Binary Loan State", y="Percent", "Good or Bad loan status")+
scale_x_discrete(breaks= c("0","1"), labels=c("Bad", "Good") )+
theme_bw()

Grade & Loan
table_1<-loan_cleaned%>%
group_by(grade)%>%
summarise(count=n())%>%
mutate(Percent = round(count/sum(count)*100,2))%>%
arrange(desc(Percent))
kable(table_1, format="markdown")
| B |
254535 |
28.68 |
| C |
245860 |
27.71 |
| A |
148202 |
16.70 |
| D |
139542 |
15.73 |
| E |
70705 |
7.97 |
| F |
23046 |
2.60 |
| G |
5489 |
0.62 |
ggplot(table_1, aes(x=reorder(grade, Percent), Percent, fill=grade))+
geom_bar(stat = "identity", color="black")+
geom_text(aes(x=grade, y=Percent, label=paste0(Percent, "%"),
vjust=0.5, hjust=0.5))+
coord_flip()+
labs(x="Percent", y="Type of Grade", title= "Grade Rate of Loan")+
theme_bw()

p3<-ggplot(loan_cleaned, aes(x=grade, fill=Indicator))+
geom_bar(stat = "count", position = "fill", col="black")+
scale_fill_brewer(palette = "Set2")+
theme_bw()
Is grade related to a higher rate?
table_2 <-loan_cleaned%>%
group_by(grade)%>%
summarise(mean=round(mean(int_rate),2))
kable(table_2)
| A |
7.24 |
| B |
10.83 |
| C |
13.98 |
| D |
17.18 |
| E |
19.90 |
| F |
23.58 |
| G |
25.63 |
p4<-ggplot(table_2, aes(x=reorder(grade), y=mean, fill=grade))+
geom_bar(stat = "identity", color="black")+
geom_text(aes(x=grade, y=mean, label=paste0(mean),vjust=1.3))+
theme_bw()+
labs(x="Credit Grade", y="Mean of interest rate")
grid.arrange(p3, p4, nrow=2)

신용 등급이 낮아질수록 채무 불이행의 수가 많아진다. 가장 대출을 많이 받는 B, C 는 평균 11 ~ 14 정도의 이율을 가진다.등급별 이자율을 확인하였을 때, 이자율이 20 이상인 경우, 채무 불이행 증가하는 것으로 나타난다.즉, 이율이 20 정도로 나타나는 등급의 고객은 채무 불이행의 가능성이 다른 등급보다 높다고 판단할 수 있다.
Term and Grade
대출 등급에 따른 대출 기간을 살펴보았을 때, B,C 등급 (평균 이자율 11~14)인 경우, 36 개월 대출기간 선호도가 높다.
plotdata_1 <-loan_cleaned%>%
group_by(grade, term)%>%
summarise(n=n())%>%
mutate(pct= n/sum(n),
lbl=scales::percent(pct))
kable(plotdata_1)
| A |
36 months |
143015 |
0.9650005 |
96.5% |
| A |
60 months |
5187 |
0.0349995 |
3.5% |
| B |
36 months |
212009 |
0.8329267 |
83.3% |
| B |
60 months |
42526 |
0.1670733 |
16.7% |
| C |
36 months |
162420 |
0.6606199 |
66.1% |
| C |
60 months |
83440 |
0.3393801 |
33.9% |
| D |
36 months |
76687 |
0.5495621 |
54.96% |
| D |
60 months |
62855 |
0.4504379 |
45.04% |
| E |
36 months |
21842 |
0.3089173 |
30.9% |
| E |
60 months |
48863 |
0.6910827 |
69.1% |
| F |
36 months |
4466 |
0.1937863 |
19.4% |
| F |
60 months |
18580 |
0.8062137 |
80.6% |
| G |
36 months |
686 |
0.1249772 |
12.5% |
| G |
60 months |
4803 |
0.8750228 |
87.5% |
ggplot(plotdata_1, aes(x=grade, y=pct, fill=term))+
geom_bar(stat = "identity", position = "fill",col="black")+
geom_text(aes(label=lbl),
size=4, position= position_stack(vjust=0.5))+
scale_fill_brewer(palette = "Set2")+
theme_bw()+
labs(x= "Grade", y="Percent",title="Grade and Term")

Employment Length & Loan
10 년 이상 재직한 사람들이 대출을 많이 받는다.
emp_table<-loan_cleaned%>%
count(emp_length)%>%
mutate(pct=n/sum(n),
pctlabel=paste0(round(pct * 100), "%"))
kable(emp_table)
| < 1 year |
70605 |
0.0795658 |
8% |
| 1 year |
57095 |
0.0643412 |
6% |
| 10+ years |
291569 |
0.3285732 |
33% |
| 2 years |
78870 |
0.0888797 |
9% |
| 3 years |
70025 |
0.0789122 |
8% |
| 4 years |
52529 |
0.0591957 |
6% |
| 5 years |
55704 |
0.0627736 |
6% |
| 6 years |
42950 |
0.0484010 |
5% |
| 7 years |
44594 |
0.0502536 |
5% |
| 8 years |
43955 |
0.0495335 |
5% |
| 9 years |
34657 |
0.0390555 |
4% |
| n/a |
44825 |
0.0505139 |
5% |
| RENT |
1 |
0.0000011 |
0% |
ggplot(emp_table, aes(x=reorder(emp_length, n), y=n))+
geom_bar(stat = "identity", fill="cornflowerblue", col="black")+
geom_text(aes(label=pctlabel), vjust=.5, hjust=1.5, size=4)+
coord_flip()+
theme_bw()

- Why? 10년 이상 재직한 사람들의 대출금과 대출 이자는 어느 정도일까?
year_10<-loan_cleaned%>%
group_by(emp_length)%>%
summarise(mean_loan = round(mean(loan_amnt),2),
mean_int = round(mean(int_rate),2))
kable(year_10)
| < 1 year |
13798.11 |
13.13 |
| 1 year |
13737.27 |
13.19 |
| 10+ years |
16103.16 |
13.26 |
| 2 years |
14015.69 |
13.19 |
| 3 years |
14144.55 |
13.17 |
| 4 years |
14263.24 |
13.21 |
| 5 years |
14340.26 |
13.26 |
| 6 years |
14555.09 |
13.40 |
| 7 years |
14770.68 |
13.45 |
| 8 years |
14971.16 |
13.26 |
| 9 years |
15178.66 |
13.29 |
| n/a |
11777.18 |
13.28 |
| RENT |
10000.00 |
13.18 |
p5<-ggplot(year_10, aes(x=reorder(emp_length,mean_loan), y=mean_loan))+
geom_bar(stat = "identity", fill="indianred3", alpha= 0.8, color= "black")+
geom_text(aes(label=mean_loan), size=3, hjust=1.5)+
theme_bw()+
coord_flip()+
labs(x="Amount of Loan", y="Period of employment", title= "The mean of loan by a lenght of employment")
p6<-ggplot(year_10, aes(x=reorder(emp_length,mean_int), y=mean_int))+
geom_bar(stat = "identity", fill="blue", alpha= 0.4, color= "black")+
geom_text(aes(label=paste0("$",mean_int), hjust=1.5))+
theme_bw()+
coord_flip()+
labs(x="Mena of intrest", y="Period of employment", title= "The amount of loan by a lenght of employment")
grid.arrange(p5, p6, nrow=2)

Default with respect to employee length - 재직 기간에 따른 채무 불이행 비율
ggplot(filter(loan_cleaned, emp_length !='n/a'),
aes(x=emp_length, fill=Indicator))+
geom_bar(stat = "count", position = "stack", col="black")+
labs(x= "Emplyment Length", y="Percent")+
scale_fill_brewer(palette = "Set1")+
theme_bw()

Home ownership & Loan
- Mortage 소유자가 전체 대출의 50 % 를 차지하며, Rent 와 Own 가 나머지 50%를 차지한다.
Home_table<-loan_cleaned%>%
count(home_ownership)%>%
mutate(pct= n/sum(n),
pctlabel= paste0(round(pct *100),"%"))%>%
arrange(desc(n))
kable(Home_table)
| MORTGAGE |
443557 |
0.4998507 |
50% |
| RENT |
356116 |
0.4013122 |
40% |
| OWN |
87470 |
0.0985712 |
10% |
| OTHER |
182 |
0.0002051 |
0% |
| NONE |
50 |
0.0000563 |
0% |
| ANY |
3 |
0.0000034 |
0% |
| 65000.0 |
1 |
0.0000011 |
0% |
ggplot(Home_table, aes(x=reorder(home_ownership,pct ), y=pct))+
geom_bar(stat = "identity", fill="indianred3", alpha=0.8, color="black")+
geom_text(aes(label=pctlabel), vjust=0.5, hjust=-0.5)+
coord_flip()+
theme_bw()+
labs(x="Type of Home Ownership", y="")

- Loan Status Rate by Home_Ownership
Home_status<-loan_cleaned%>%
group_by(home_ownership, loan_status)%>%
summarise(n=n())%>%
mutate(pct= n/sum(n),
lbl=scales::percent(pct))
kable(Home_status)
| 65000.0 |
n |
1 |
1.0000000 |
100% |
| ANY |
Current |
2 |
0.6666667 |
66.7% |
| ANY |
Fully Paid |
1 |
0.3333333 |
33.3% |
| MORTGAGE |
Charged Off |
19878 |
0.0448150 |
4.5% |
| MORTGAGE |
Current |
303764 |
0.6848364 |
68.5% |
| MORTGAGE |
Default |
498 |
0.0011227 |
0.1% |
| MORTGAGE |
Does not meet the credit policy. Status:Charged Off |
348 |
0.0007846 |
0.1% |
| MORTGAGE |
Does not meet the credit policy. Status:Fully Paid |
908 |
0.0020471 |
0.2% |
| MORTGAGE |
Fully Paid |
104966 |
0.2366460 |
23.7% |
| MORTGAGE |
In Grace Period |
2855 |
0.0064366 |
0.6% |
| MORTGAGE |
Issued |
4220 |
0.0095140 |
1.0% |
| MORTGAGE |
Late (16-30 days) |
1101 |
0.0024822 |
0.2% |
| MORTGAGE |
Late (31-120 days) |
5019 |
0.0113153 |
1.1% |
| NONE |
Charged Off |
7 |
0.1400000 |
14.0% |
| NONE |
Current |
2 |
0.0400000 |
4.0% |
| NONE |
Does not meet the credit policy. Status:Charged Off |
1 |
0.0200000 |
2.0% |
| NONE |
Does not meet the credit policy. Status:Fully Paid |
4 |
0.0800000 |
8.0% |
| NONE |
Fully Paid |
36 |
0.7200000 |
72.0% |
| OTHER |
Charged Off |
27 |
0.1483516 |
14.8% |
| OTHER |
Current |
3 |
0.0164835 |
1.6% |
| OTHER |
Does not meet the credit policy. Status:Charged Off |
11 |
0.0604396 |
6.0% |
| OTHER |
Does not meet the credit policy. Status:Fully Paid |
27 |
0.1483516 |
14.8% |
| OTHER |
Fully Paid |
114 |
0.6263736 |
62.6% |
| OWN |
Charged Off |
4025 |
0.0460158 |
4.6% |
| OWN |
Current |
62041 |
0.7092832 |
70.9% |
| OWN |
Default |
110 |
0.0012576 |
0.1% |
| OWN |
Does not meet the credit policy. Status:Charged Off |
49 |
0.0005602 |
0.1% |
| OWN |
Does not meet the credit policy. Status:Fully Paid |
138 |
0.0015777 |
0.2% |
| OWN |
Fully Paid |
17960 |
0.2053275 |
20.5% |
| OWN |
In Grace Period |
637 |
0.0072825 |
0.7% |
| OWN |
Issued |
1038 |
0.0118669 |
1.2% |
| OWN |
Late (16-30 days) |
260 |
0.0029724 |
0.3% |
| OWN |
Late (31-120 days) |
1212 |
0.0138562 |
1.4% |
| RENT |
Charged Off |
21311 |
0.0598429 |
6.0% |
| RENT |
Current |
235966 |
0.6626099 |
66.3% |
| RENT |
Default |
611 |
0.0017157 |
0.2% |
| RENT |
Does not meet the credit policy. Status:Charged Off |
352 |
0.0009884 |
0.1% |
| RENT |
Does not meet the credit policy. Status:Fully Paid |
911 |
0.0025582 |
0.3% |
| RENT |
Fully Paid |
84646 |
0.2376922 |
23.8% |
| RENT |
In Grace Period |
2761 |
0.0077531 |
0.8% |
| RENT |
Issued |
3202 |
0.0089915 |
0.9% |
| RENT |
Late (16-30 days) |
996 |
0.0027968 |
0.3% |
| RENT |
Late (31-120 days) |
5360 |
0.0150513 |
1.5% |
ggplot(Home_status, aes(x= home_ownership, y=pct, fill=loan_status))+
geom_bar(stat = "identity", position = "stack", col="black")+
geom_text(aes(label=lbl), position = position_stack(vjust=0.5), size=2.0)+
theme_bw()+
labs(x="Home Ownership", y="Percent")

- Default Rate by Home_ownership
home_rate<-loan_cleaned%>%
group_by(home_ownership)%>%
count(Indicator)%>%
mutate(pct= n/sum(n),
lbl=scales:: percent(pct))
kable(home_rate)
| 65000.0 |
Good |
1 |
1.0000000 |
100% |
| ANY |
Good |
3 |
1.0000000 |
100% |
| MORTGAGE |
Bad |
29699 |
0.0669564 |
6.7% |
| MORTGAGE |
Good |
413858 |
0.9330436 |
93.3% |
| NONE |
Bad |
8 |
0.1600000 |
16.0% |
| NONE |
Good |
42 |
0.8400000 |
84.0% |
| OTHER |
Bad |
38 |
0.2087912 |
20.9% |
| OTHER |
Good |
144 |
0.7912088 |
79.1% |
| OWN |
Bad |
6293 |
0.0719447 |
7.2% |
| OWN |
Good |
81177 |
0.9280553 |
92.8% |
| RENT |
Bad |
31391 |
0.0881482 |
8.8% |
| RENT |
Good |
324725 |
0.9118518 |
91.2% |
ggplot(home_rate, aes(x=home_ownership, y=n, fill=Indicator))+
geom_bar(stat = "identity", position = "fill", col="black")+
geom_text(aes(label=lbl), position=position_fill(vjust=0.5))+
labs(x="Home Ownership", y="Percent")+
theme_bw()

Vertification & Loan
- Vertifaction Rate of Loan
V_Loan<-loan_cleaned%>%
count(verification_status)%>%
mutate(pct= n/sum(n),
lbl= scales:: percent(pct))%>%
arrange(desc(pct))
ggplot(V_Loan, aes(x=reorder(verification_status, pct), y=pct))+
geom_bar(stat = "identity", fill="Indianred3", alpha=0.7, col="black")+
geom_text(aes(label=lbl), vjust=-0.5)+
coord_flip()+
theme_bw()

- Default rate by verification
ggplot(filter(loan_cleaned, verification_status !='Nov-2015'), aes(x=verification_status, fill=Indicator))+
geom_bar(stat = "count", col="Black", position = "fill")+
theme_bw()+
labs(x="Verification Status", y="Percent")

- The Mean of Loan amount by vertification and grade
Vergrade_loan<-loan_cleaned%>%
group_by(verification_status, grade)%>%
summarise(n=mean(loan_amnt))
kable(Vergrade_loan)
| Not Verified |
A |
11423.35 |
| Not Verified |
B |
11118.95 |
| Not Verified |
C |
10994.67 |
| Not Verified |
D |
10871.37 |
| Not Verified |
E |
12133.26 |
| Not Verified |
F |
11454.21 |
| Not Verified |
G |
14678.71 |
| Nov-2015 |
C |
10000.00 |
| Source Verified |
A |
15649.29 |
| Source Verified |
B |
14494.75 |
| Source Verified |
C |
14930.06 |
| Source Verified |
D |
16145.12 |
| Source Verified |
E |
17906.83 |
| Source Verified |
F |
18862.78 |
| Source Verified |
G |
19566.95 |
| Verified |
A |
16812.02 |
| Verified |
B |
15848.95 |
| Verified |
C |
16689.91 |
| Verified |
D |
17335.82 |
| Verified |
E |
19718.09 |
| Verified |
F |
20509.23 |
| Verified |
G |
21693.93 |
ggplot(filter(Vergrade_loan, verification_status !='Nov-2015'), aes(x=grade, y=n))+
geom_histogram(stat = "identity", fill= "cornflowerblue")+
facet_wrap(~verification_status)+
labs(x="Grade", y=" Mean of loan")+
theme_bw()

*Defualt rate by verfiation and grade
vergrade_indi<-loan_cleaned%>%
group_by(verification_status, grade)%>%
count(Indicator)
kable(vergrade_indi)
| Not Verified |
A |
Bad |
1561 |
| Not Verified |
A |
Good |
63501 |
| Not Verified |
B |
Bad |
5104 |
| Not Verified |
B |
Good |
87359 |
| Not Verified |
C |
Bad |
5326 |
| Not Verified |
C |
Good |
61715 |
| Not Verified |
D |
Bad |
3628 |
| Not Verified |
D |
Good |
26956 |
| Not Verified |
E |
Bad |
1259 |
| Not Verified |
E |
Good |
7917 |
| Not Verified |
F |
Bad |
397 |
| Not Verified |
F |
Good |
1635 |
| Not Verified |
G |
Bad |
119 |
| Not Verified |
G |
Good |
272 |
| Nov-2015 |
C |
Good |
1 |
| Source Verified |
A |
Bad |
1113 |
| Source Verified |
A |
Good |
50825 |
| Source Verified |
B |
Bad |
3935 |
| Source Verified |
B |
Good |
87743 |
| Source Verified |
C |
Bad |
6360 |
| Source Verified |
C |
Good |
87326 |
| Source Verified |
D |
Bad |
5647 |
| Source Verified |
D |
Good |
48200 |
| Source Verified |
E |
Bad |
3656 |
| Source Verified |
E |
Good |
25059 |
| Source Verified |
F |
Bad |
1444 |
| Source Verified |
F |
Good |
6558 |
| Source Verified |
G |
Bad |
383 |
| Source Verified |
G |
Good |
1309 |
| Verified |
A |
Bad |
989 |
| Verified |
A |
Good |
30213 |
| Verified |
B |
Bad |
4417 |
| Verified |
B |
Good |
65977 |
| Verified |
C |
Bad |
7368 |
| Verified |
C |
Good |
77764 |
| Verified |
D |
Bad |
6584 |
| Verified |
D |
Good |
48527 |
| Verified |
E |
Bad |
4830 |
| Verified |
E |
Good |
27984 |
| Verified |
F |
Bad |
2542 |
| Verified |
F |
Good |
10470 |
| Verified |
G |
Bad |
767 |
| Verified |
G |
Good |
2639 |
ggplot(filter(vergrade_indi, verification_status !='Nov-2015'),
aes(x=verification_status, y= n, fill=Indicator))+
geom_bar(stat = "identity",position = "stack")+
facet_grid(.~grade)+
theme_bw()+
theme(axis.text.x = element_text(angle = 90,hjust = 1))

Purpose & Loan
p_table<-loan_cleaned%>%
count(purpose)%>%
mutate(pct=n/sum(n),
lbl=scales::percent(pct))%>%
arrange(desc(n))
kable(p_table)
| debt_consolidation |
524214 |
0.5907442 |
59.1% |
| credit_card |
206181 |
0.2323483 |
23.2% |
| home_improvement |
51829 |
0.0584068 |
5.8% |
| other |
42894 |
0.0483379 |
4.8% |
| major_purchase |
17277 |
0.0194697 |
1.9% |
| small_business |
10377 |
0.0116940 |
1.2% |
| car |
8863 |
0.0099878 |
1.0% |
| medical |
8540 |
0.0096238 |
1.0% |
| moving |
5414 |
0.0061011 |
0.6% |
| vacation |
4736 |
0.0053371 |
0.5% |
| house |
3707 |
0.0041775 |
0.4% |
| wedding |
2347 |
0.0026449 |
0.3% |
| renewable_energy |
575 |
0.0006480 |
0.1% |
| educational |
423 |
0.0004767 |
0.0% |
| Debt consolidation |
1 |
0.0000011 |
0.0% |
| Paying off my son’s dept… |
1 |
0.0000011 |
0.0% |
ggplot(filter(p_table, lbl >= 0.01),aes(x=reorder(purpose,pct), y=pct))+
geom_bar(stat = "identity", fill="blue", alpha=0.7, col="white")+
geom_text(aes(label=lbl), vjust= 0.5, hjust= -0.5)+
coord_flip()+
theme_bw()+
labs(y="Percent", x="Purpose for loan")

- The loan amount by purpose
ggplot(loan_cleaned,
aes(x=loan_amnt, y=purpose, fill=purpose))+
geom_density_ridges()+
theme_ridges()+
theme(legend.position = "none")

- Default rate by purpose of Loan
Dp_table <-loan_cleaned%>%
group_by(purpose)%>%
count(Indicator)%>%
mutate(pct = n/sum(n),
lbl= scales::percent(pct))%>%
arrange(desc(n))
kable(Dp_table, format = "markdown")
| debt_consolidation |
Good |
482606 |
0.9206278 |
92.1% |
| credit_card |
Good |
194426 |
0.9429870 |
94.3% |
| home_improvement |
Good |
48276 |
0.9314476 |
93.1% |
| debt_consolidation |
Bad |
41608 |
0.0793722 |
7.9% |
| other |
Good |
38731 |
0.9029468 |
90.3% |
| major_purchase |
Good |
15983 |
0.9251027 |
92.5% |
| credit_card |
Bad |
11755 |
0.0570130 |
5.7% |
| small_business |
Good |
8596 |
0.8283704 |
82.8% |
| car |
Good |
8267 |
0.9327541 |
93.3% |
| medical |
Good |
7736 |
0.9058548 |
90.6% |
| moving |
Good |
4807 |
0.8878833 |
88.8% |
| vacation |
Good |
4334 |
0.9151182 |
91.5% |
| other |
Bad |
4163 |
0.0970532 |
9.7% |
| home_improvement |
Bad |
3553 |
0.0685524 |
6.9% |
| house |
Good |
3290 |
0.8875101 |
88.8% |
| wedding |
Good |
2058 |
0.8768641 |
87.7% |
| small_business |
Bad |
1781 |
0.1716296 |
17.2% |
| major_purchase |
Bad |
1294 |
0.0748973 |
7.5% |
| medical |
Bad |
804 |
0.0941452 |
9.4% |
| moving |
Bad |
607 |
0.1121167 |
11.2% |
| car |
Bad |
596 |
0.0672459 |
6.7% |
| renewable_energy |
Good |
503 |
0.8747826 |
87.5% |
| house |
Bad |
417 |
0.1124899 |
11.2% |
| vacation |
Bad |
402 |
0.0848818 |
8.5% |
| educational |
Good |
335 |
0.7919622 |
79.2% |
| wedding |
Bad |
289 |
0.1231359 |
12.3% |
| educational |
Bad |
88 |
0.2080378 |
20.8% |
| renewable_energy |
Bad |
72 |
0.1252174 |
12.5% |
| Debt consolidation |
Good |
1 |
1.0000000 |
100% |
| Paying off my son’s dept… |
Good |
1 |
1.0000000 |
100% |
ggplot(Dp_table, aes(x=reorder(purpose,pct) ,y=pct, fill=Indicator))+
geom_bar(stat = "identity", col="black")+
theme(axis.text.x = element_text(angle = 90,hjust = 1))+
geom_text(aes(label=lbl), size= 3, position = position_stack(vjust=0.5))

- Education and Small business
Edu_small <- subset(loan_cleaned, loan_cleaned$purpose== "educational" | loan_cleaned$purpose== "small_business")
funtable<-Edu_small%>%
group_by(application_type, grade, term, Indicator)%>%
count()
ggplot(funtable,aes(axis1 = application_type ,
axis2 = grade,
axis3 = Indicator ,
y = n)) +
geom_alluvium(aes(fill = term)) +
geom_stratum() +
geom_text(stat = "stratum",label.strata = TRUE)

- Defualt rate by region and grade regarding customer with educational and small business purpose
a<-Edu_small%>%
group_by(addr_state, grade, Indicator)%>%
count(Indicator)%>%
arrange(desc(n))
summary(a$Indicator)
## Bad Good
## 260 316
ggplot(a, aes(x=grade, fill=Indicator))+
geom_bar(position = "stack")+
facet_wrap(~addr_state)

State and Loan
- Visualisation of state (Cateogrial variable) with Tree map
plotdata <- loan_cleaned%>%
count(addr_state)
ggplot(plotdata, aes(fill=addr_state,
area=n,
label=addr_state))+
geom_treemap(col="black")+
geom_treemap_text(color="white", place="center")+
theme(legend.position = "none")

ggplot(filter(loan_cleaned, addr_state != '22.32' & addr_state != '16.33'),aes(x=addr_state, y=loan_amnt, fill=addr_state))+
stat_summary(fun.y="sum", geom="bar")+
coord_flip()

- Heat Map of loan amount in all state
loan_cleaned$region <- loan_cleaned$addr_state
loan_cleaned$region <- as.factor(loan_cleaned$region)
levels(loan_cleaned$region)<- c("1","2", "alaska", "alabama","arkansas", "arizona", "california","colorado","connecticut","district of columbia","delaware","florida","georgia","hawaii","iowa","idaho","illinois","indiana","kansas","kentucky","louisiana","massachusetts","maryland","maine","michigan","minnesota","missouri","mississippi","montana","north carolina","north dakota","nebraska","new hampshire","new jersey","new mexico","nevada","new york","ohio","oklahoma","oregon","pennsylvania","rhode island","south carolina","south dakota","tennessee","texas","utah","virginia","vermont","washington","wisconsin","west virginia","wyoming")
all_states <- map_data("state")
state_by_loan<-loan_cleaned%>%
group_by(region)%>%
summarise(value=sum(loan_amnt), na.rm=TRUE)
state_by_loan$region <- as.character(state_by_loan$region)
Total <- merge(all_states, state_by_loan, by="region")
head(Total,3)
## region long lat group order subregion value na.rm
## 1 alabama -87.46201 30.38968 1 1 <NA> 164627650 TRUE
## 2 alabama -87.48493 30.37249 1 2 <NA> 164627650 TRUE
## 3 alabama -87.52503 30.37249 1 3 <NA> 164627650 TRUE
p <- ggplot()
p <- p + geom_polygon(data=Total, aes(x=long, y=lat, group = group, fill=Total$value),colour="white"
) + scale_fill_continuous(low = "skyblue", high = "darkblue", guide="colorbar")
P1 <- p + theme_bw() + labs(fill = "Gradient of loan amount"
,title = "Loan amount in all states", x="", y="")
P1 + scale_y_continuous(breaks=c()) + scale_x_continuous(breaks=c()) + theme(panel.border = element_blank())

a=data.table(table(loan_cleaned$addr_state))
setnames(a,c("region","count"))
a$region=sapply(state.name[match(a$region, state.abb)],tolower)
all_states <- map_data("state")
Total <- merge(all_states, a, by="region")
head(Total,3)
## region long lat group order subregion count
## 1 alabama -87.46201 30.38968 1 1 <NA> 11200
## 2 alabama -87.48493 30.37249 1 2 <NA> 11200
## 3 alabama -87.95475 30.24644 1 13 <NA> 11200
ggplot(Total, aes(x=long, y=lat, map_id = region)) +
geom_map(aes(fill= count), map = all_states)+
labs(title="Loan counts in each state",x="",y="")+
scale_fill_gradientn("",colours=terrain.colors(8),guide = "legend")+
theme_bw()

Loan Amount & Loan default by Terms
ggplot(filter(loan_cleaned, verification_status !='Nov-2015'),
aes(x=loan_amnt, fill=Indicator))+
geom_density()+
facet_grid(verification_status ~ term)

Income & Loan default
- Which state has the highest rate income?
all_states <- map_data("state")
Income_state<-loan_cleaned%>%
group_by(region)%>%
summarise(avg_income = mean(annual_inc))
t <- merge(all_states, Income_state, by="region")
head(t,3)
## region long lat group order subregion avg_income
## 1 alabama -87.46201 30.38968 1 1 <NA> NA
## 2 alabama -87.48493 30.37249 1 2 <NA> NA
## 3 alabama -87.52503 30.37249 1 3 <NA> NA
p3 <- ggplot()+
geom_polygon(data=t,aes(x=long, y=lat, group=group, fill=t$avg_income),colour="white") +
scale_fill_continuous(low = "skyblue", high = "darkblue", guide="colorbar")+
ggtitle("Average income by State")
p3

- Distribution of annaul income
Desc(loan_cleaned$annual_inc)
## ------------------------------------------------------------------------------
## loan_cleaned$annual_inc (factor)
##
## length n NAs unique levels dupes
## 887'379 887'379 0 49'386 49'386 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 60000.0 34'281 3.9% 34'281 3.9%
## 2 50000.0 30'575 3.4% 64'856 7.3%
## 3 65000.0 25'497 2.9% 90'353 10.2%
## 4 70000.0 24'121 2.7% 114'474 12.9%
## 5 40000.0 23'943 2.7% 138'417 15.6%
## 6 80000.0 22'729 2.6% 161'146 18.2%
## 7 45000.0 22'699 2.6% 183'845 20.7%
## 8 75000.0 22'435 2.5% 206'280 23.2%
## 9 55000.0 20'755 2.3% 227'035 25.6%
## 10 90000.0 17'159 1.9% 244'194 27.5%
## 11 100000.0 17'131 1.9% 261'325 29.4%
## 12 85000.0 15'648 1.8% 276'973 31.2%
## ... etc.
## [list output truncated]

loan_cleaned$annual_inc <-as.numeric(loan_cleaned$annual_in)
loan_cleaned <- mutate(loan_cleaned, inc_grp = annual_inc)
loan_cleaned$inc_grp[loan_cleaned$annual_inc <= 5000] <- "verylow_inc"
loan_cleaned$inc_grp[loan_cleaned$annual_inc > 5000 & loan_cleaned$annual_inc <= 8000] <- 'low_inc'
loan_cleaned$inc_grp[loan_cleaned$annual_inc > 8000 & loan_cleaned$annual_inc <= 12000] <- 'middle_inc'
loan_cleaned$inc_grp[loan_cleaned$annual_inc > 12000 & loan_cleaned$annual_inc <= 24000] <- 'high_inc'
loan_cleaned$inc_grp[loan_cleaned$annual_inc > 24000] <- 'veryhigh_inc'
loan_cleaned$inc_grp <-as.factor(loan_cleaned$inc_grp)
summary(loan_cleaned$inc_grp)
## high_inc low_inc middle_inc veryhigh_inc verylow_inc
## 205410 52085 49791 481830 98263
소득이 높은 그룹이 대출을 자주 하는것으로 나타난다.
pt1<-loan_cleaned%>%
group_by(inc_grp)%>%
summarise(count=n())%>%
mutate(pct = count/sum(count),
lbl= scales:: percent(pct))
kable(pt1)
| high_inc |
205410 |
0.2314794 |
23.1% |
| low_inc |
52085 |
0.0586953 |
5.9% |
| middle_inc |
49791 |
0.0561102 |
5.6% |
| veryhigh_inc |
481830 |
0.5429811 |
54.3% |
| verylow_inc |
98263 |
0.1107340 |
11.1% |
ggplot(pt1, aes(x=reorder(inc_grp, pct), y=pct))+
geom_bar(stat = "identity", fill="indianred3", alpha= 0.9, color="black")+
geom_text(aes(label=lbl), vjust=-0.25, hjust=0.5)+
coord_flip()+
theme_bw()+
labs(x="Gruop of Income", y="Percent")

- Default rate by income groups 소득이 높은 그룹이 채무 불 이행률이 가장 높은 것으로 나타났다. 다른 수입군에 비해 이자율의 차이는 없어보이며, 소득이 많은 그룹의 채무 불이행률에 대해서 이자율과는 낮은 상관이 있는것 같아 보인다.
ggplot(loan_cleaned, aes(x=reorder(inc_grp, annual_inc), fill=Indicator))+
geom_bar(position = "fill")+
theme_bw()

ggplot(loan_cleaned, aes(y= inc_grp, x=int_rate, fill=inc_grp))+
geom_density_ridges()+
theme_ridges()
