library(dplyr)
library(ggplot2)
library(corrplot)
setwd("C:\\R\\LoanData_RMarkdown")
getwd()
## [1] "C:/R/LoanData_RMarkdown"
loan <- read.csv("C:\\R\\LoanData_RMarkdown\\Loan payments data.csv", header = T)
head(loan)
## Loan_ID loan_status Principal terms effective_date due_date
## 1 xqd20166231 PAIDOFF 1000 30 9/8/2016 10/7/2016
## 2 xqd20168902 PAIDOFF 1000 30 9/8/2016 10/7/2016
## 3 xqd20160003 PAIDOFF 1000 30 9/8/2016 10/7/2016
## 4 xqd20160004 PAIDOFF 1000 15 9/8/2016 9/22/2016
## 5 xqd20160005 PAIDOFF 1000 30 9/9/2016 10/8/2016
## 6 xqd20160706 PAIDOFF 300 7 9/9/2016 9/15/2016
## paid_off_time past_due_days age education Gender
## 1 9/14/2016 19:31 NA 45 High School or Below male
## 2 10/7/2016 9:00 NA 50 Bechalor female
## 3 9/25/2016 16:58 NA 33 Bechalor female
## 4 9/22/2016 20:00 NA 27 college male
## 5 9/23/2016 21:36 NA 28 college female
## 6 9/9/2016 13:45 NA 35 Master or Above male
변수명 | 변수 설명 |
---|---|
Loan_ID | 대출받은 고객 ID |
paid off | 대출금 상환을 기간 내에 완료한 상태 |
collection | 대출금 상환을 기간 내에 완료하지 못하고 있는 상태(연체) |
collection_paid off | 대출금 상환을 만기 후에 완료한 상태 |
Principal | 대출 원금 |
terms | 1주일, 격주, 한달 단위의 상환 기간 |
effective_date | 대출 상환 계약일 |
due_date | 대출 만기일 |
paid_off_time | 대출금을 지급한 실제 기간 |
past_due_days | 만기되고 난 후 지난 대출 기간 |
age | 나이 |
education | 학력 수준(고졸 이하, 전문대졸, 학사, 석사이상) |
Gender | 성별 |
str(loan)
## 'data.frame': 500 obs. of 11 variables:
## $ Loan_ID : Factor w/ 500 levels "xqd12160159",..: 472 474 5 6 7 470 8 9 471 10 ...
## $ loan_status : Factor w/ 3 levels "COLLECTION","COLLECTION_PAIDOFF",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Principal : int 1000 1000 1000 1000 1000 300 1000 1000 1000 800 ...
## $ terms : int 30 30 30 15 30 7 30 30 30 15 ...
## $ effective_date: Factor w/ 7 levels "9/10/2016","9/11/2016",..: 6 6 6 6 7 7 7 7 7 1 ...
## $ due_date : Factor w/ 25 levels "10/10/2016","10/11/2016",..: 7 7 7 19 8 14 8 8 8 21 ...
## $ paid_off_time : Factor w/ 321 levels "","10/1/2016 10:18",..: 171 101 254 211 223 321 99 87 108 234 ...
## $ past_due_days : int NA NA NA NA NA NA NA NA NA NA ...
## $ age : int 45 50 33 27 28 35 29 36 28 26 ...
## $ education : Factor w/ 4 levels "Bechalor","college",..: 3 1 1 2 2 4 2 2 2 2 ...
## $ Gender : Factor w/ 2 levels "female","male": 2 1 1 2 1 2 2 2 2 2 ...
summary(loan)
## Loan_ID loan_status Principal
## xqd12160159: 1 COLLECTION :100 Min. : 300.0
## xqd20110409: 1 COLLECTION_PAIDOFF:100 1st Qu.:1000.0
## xqd20125284: 1 PAIDOFF :300 Median :1000.0
## xqd20151038: 1 Mean : 943.2
## xqd20160003: 1 3rd Qu.:1000.0
## xqd20160004: 1 Max. :1000.0
## (Other) :494
## terms effective_date due_date paid_off_time
## Min. : 7.00 9/10/2016: 46 10/10/2016:123 :100
## 1st Qu.:15.00 9/11/2016:231 9/25/2016 : 87 9/25/2016 9:00 : 9
## Median :30.00 9/12/2016:148 10/11/2016: 69 9/26/2016 9:00 : 9
## Mean :22.82 9/13/2016: 23 9/26/2016 : 60 10/10/2016 9:00 : 8
## 3rd Qu.:30.00 9/14/2016: 33 10/9/2016 : 28 10/13/2016 9:00 : 6
## Max. :30.00 9/8/2016 : 4 10/13/2016: 21 10/11/2016 16:00: 5
## 9/9/2016 : 15 (Other) :112 (Other) :363
## past_due_days age education Gender
## Min. : 1.00 Min. :18.00 Bechalor : 67 female: 77
## 1st Qu.: 3.00 1st Qu.:27.00 college :220 male :423
## Median :37.00 Median :30.00 High School or Below:209
## Mean :36.01 Mean :31.12 Master or Above : 4
## 3rd Qu.:60.00 3rd Qu.:35.00
## Max. :76.00 Max. :51.00
## NA's :300
데이터 확인 결과 : Dataframe / 500개의 관측치 / 11개의 변수
모든 관측치가 Unique한 Loan_ID와 날짜와 시간을 나타내는 데이터인 effective_date, due_date, paid_off_time, past_due_days 변수를 분석에 제외시키기로 판단함.
length(unique(loan$Loan_ID))
## [1] 500
loan_1 <- loan %>% select(-Loan_ID, -effective_date, -due_date, -paid_off_time, -past_due_days)
# loan_1 <- loan %>% select(-c(Loan_ID, effective_date, due_date, paid_off_time))
head(loan_1)
## loan_status Principal terms age education Gender
## 1 PAIDOFF 1000 30 45 High School or Below male
## 2 PAIDOFF 1000 30 50 Bechalor female
## 3 PAIDOFF 1000 30 33 Bechalor female
## 4 PAIDOFF 1000 15 27 college male
## 5 PAIDOFF 1000 30 28 college female
## 6 PAIDOFF 300 7 35 Master or Above male
loan_2 <- rename(loan_1, principal = Principal,
gender = Gender)
ls(loan_2)
## [1] "age" "education" "gender" "loan_status" "principal"
## [6] "terms"
str(loan_2)
## 'data.frame': 500 obs. of 6 variables:
## $ loan_status: Factor w/ 3 levels "COLLECTION","COLLECTION_PAIDOFF",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ principal : int 1000 1000 1000 1000 1000 300 1000 1000 1000 800 ...
## $ terms : int 30 30 30 15 30 7 30 30 30 15 ...
## $ age : int 45 50 33 27 28 35 29 36 28 26 ...
## $ education : Factor w/ 4 levels "Bechalor","college",..: 3 1 1 2 2 4 2 2 2 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 1 1 2 1 2 2 2 2 2 ...
summary(loan_2)
## loan_status principal terms age
## COLLECTION :100 Min. : 300.0 Min. : 7.00 Min. :18.00
## COLLECTION_PAIDOFF:100 1st Qu.:1000.0 1st Qu.:15.00 1st Qu.:27.00
## PAIDOFF :300 Median :1000.0 Median :30.00 Median :30.00
## Mean : 943.2 Mean :22.82 Mean :31.12
## 3rd Qu.:1000.0 3rd Qu.:30.00 3rd Qu.:35.00
## Max. :1000.0 Max. :30.00 Max. :51.00
## education gender
## Bechalor : 67 female: 77
## college :220 male :423
## High School or Below:209
## Master or Above : 4
##
##
Loan_ID, effective_date, due_date, paid_off_time 변수들 제거 확인
ggplot(data = loan_2, aes(x = terms, y = principal)) + geom_jitter() # geom_point()
상관계수를 구해보자
attach(loan_2)
cov(principal, terms)
## [1] 492.5884
cor(principal, terms, method = 'pearson')
## [1] 0.5343015
cor.test(principal, terms)
##
## Pearson's product-moment correlation
##
## data: principal and terms
## t = 14.106, df = 498, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4685647 0.5941541
## sample estimates:
## cor
## 0.5343015
공분산의 값은 492.5884로 양수이므로 양의 상관관계를 보이나 두 관측치의 단위의 크기에 영향을 받으므로 절대적이지 않음.
따라서 피어슨의 상관계수를 구해보면 약 0.53으로 양의 상관관계를 보이고 검정결과 p-value < 2.2e-16으로 귀무가설을 기각하므로 상관관계가 있다고 할 수 있다.
즉, 대출 상환 기간과 원금은 양의 상관관계가 있다고 볼 수 있다.
단순선형회귀분석
result <- lm(principal~terms)
result
##
## Call:
## lm(formula = principal ~ terms)
##
## Coefficients:
## (Intercept) terms
## 767.533 7.697
b0 = 767.533, b1 = 7.697로
principal = 767.533 + 7.697 * terms의 회귀식을 통해
대출 상환 기간이 1 증가할 때마다 원금이 7.697씩 증가한다고 볼 수 있다.
plot(principal~terms)
abline(result, col='red')
회귀모형의 적합성 검정
summary(result)
##
## Call:
## lm(formula = principal ~ terms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -521.41 1.57 1.57 1.57 178.59
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 767.5335 13.1950 58.17 <2e-16 ***
## terms 7.6966 0.5456 14.11 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 97.51 on 498 degrees of freedom
## Multiple R-squared: 0.2855, Adjusted R-squared: 0.284
## F-statistic: 199 on 1 and 498 DF, p-value: < 2.2e-16
F통계량의 p-value가 2.2e-16으로 귀무가설을 기각하여 회귀모형이 통계적으로 유의하다고 할 수 있다.
또한, Adjusted R-squared값은 0.284로 28%의 설명력을 가진다.
ggplot(data = loan_2, aes(x = age, y = principal)) + geom_jitter() # geom_point()
상관계수를 구해보자
cor.test(principal, age)
##
## Pearson's product-moment correlation
##
## data: principal and age
## t = -2.0757, df = 498, p-value = 0.03844
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.178850699 -0.004962316
## sample estimates:
## cor
## -0.0926126
상관계수의 검정 결과 p-value의 값이 0.03844로 귀무가설을 기각 할 수 있지만 상관계수의 값이 약 -0.1이므로 약한 음의 상관관계가 있다고 볼 수 있다.
cor_loan <- loan_2[2:4]
cor(cor_loan)
## principal terms age
## principal 1.0000000 0.53430152 -0.09261260
## terms 0.5343015 1.00000000 -0.07425869
## age -0.0926126 -0.07425869 1.00000000
ggplot(data = loan_2, aes(x = loan_status)) + geom_bar()
COLLECTION : 100, COLLECTION_PAIDOFF : 100, PAIDOFF : 300
loan_3 <- loan_2 %>%
group_by(loan_status, principal) %>%
summarise(count = n())
loan_3
## # A tibble: 11 x 3
## # Groups: loan_status [?]
## loan_status principal count
## <fctr> <int> <int>
## 1 COLLECTION 800 23
## 2 COLLECTION 1000 77
## 3 COLLECTION_PAIDOFF 500 1
## 4 COLLECTION_PAIDOFF 800 19
## 5 COLLECTION_PAIDOFF 1000 80
## 6 PAIDOFF 300 6
## 7 PAIDOFF 500 2
## 8 PAIDOFF 700 1
## 9 PAIDOFF 800 69
## 10 PAIDOFF 900 2
## 11 PAIDOFF 1000 220
PAIDOFF의 경우 주어진 기간내에 대출 상환을 완료한 경우이므로 제외시켜보기로 결정.
loan_4 <- loan_2 %>%
filter(loan_status %in% c("COLLECTION", "COLLECTION_PAIDOFF")) %>%
group_by(loan_status, principal) %>%
summarise(count = n())
## Warning: package 'bindrcpp' was built under R version 3.4.3
loan_4
## # A tibble: 5 x 3
## # Groups: loan_status [?]
## loan_status principal count
## <fctr> <int> <int>
## 1 COLLECTION 800 23
## 2 COLLECTION 1000 77
## 3 COLLECTION_PAIDOFF 500 1
## 4 COLLECTION_PAIDOFF 800 19
## 5 COLLECTION_PAIDOFF 1000 80
COLLECTION_PAIDOFF나 COLLECTION의 경우 대출 원금이 많은 것과 연관이 있음을 알 수 있다.
ggplot(data = loan_4, aes(x = principal, y = count, fill = loan_status)) + geom_bar(stat = "identity", position = "dodge")
그 결과, 대출 원금이 많을 수록 제 기간 안에 갚지 못하는 경우가 많음을 확인.
table(loan$Gender)
##
## female male
## 77 423
male : 423, female : 77로 남성의 수가 압도적으로 많음
loan_5 <- loan_2 %>%
group_by(loan_status, gender) %>%
summarise(count = n()) %>%
mutate(percent = count / sum(count))
loan_5
## # A tibble: 6 x 4
## # Groups: loan_status [3]
## loan_status gender count percent
## <fctr> <fctr> <int> <dbl>
## 1 COLLECTION female 10 0.1000000
## 2 COLLECTION male 90 0.9000000
## 3 COLLECTION_PAIDOFF female 14 0.1400000
## 4 COLLECTION_PAIDOFF male 86 0.8600000
## 5 PAIDOFF female 53 0.1766667
## 6 PAIDOFF male 247 0.8233333
mutate 함수를 사용해서 전체 인원에서 각 항목별로 퍼센트를 구하는 percent 변수를 추가했는데 전체 인원이 아닌 성별로 나누어서 구하는 방법은 없을까..?
ggplot(data = loan_5, aes(x = gender, y = count, fill = loan_status)) + geom_bar(stat = "identity", position = "dodge")
table(loan$education)
##
## Bechalor college High School or Below
## 67 220 209
## Master or Above
## 4
bechalor : 67, college : 220, High School or Below : 209, Master or Above : 4
loan_6 <- loan_2 %>%
group_by(loan_status, education) %>%
summarise(count = n())
loan_6
## # A tibble: 11 x 3
## # Groups: loan_status [?]
## loan_status education count
## <fctr> <fctr> <int>
## 1 COLLECTION Bechalor 11
## 2 COLLECTION college 39
## 3 COLLECTION High School or Below 49
## 4 COLLECTION Master or Above 1
## 5 COLLECTION_PAIDOFF Bechalor 15
## 6 COLLECTION_PAIDOFF college 48
## 7 COLLECTION_PAIDOFF High School or Below 37
## 8 PAIDOFF Bechalor 41
## 9 PAIDOFF college 133
## 10 PAIDOFF High School or Below 123
## 11 PAIDOFF Master or Above 3
education을 고졸이하를 1로, 그 이상을 2로 분류하기 위해 mutate 함수를 이용해보았다.
loan_7 <- loan_2 %>%
mutate(education_2 = ifelse(education == "High School or Below", 1, 2)) %>%
group_by(loan_status, education_2) %>%
summarise(count = n())
loan_7
## # A tibble: 6 x 3
## # Groups: loan_status [?]
## loan_status education_2 count
## <fctr> <dbl> <int>
## 1 COLLECTION 1 49
## 2 COLLECTION 2 51
## 3 COLLECTION_PAIDOFF 1 37
## 4 COLLECTION_PAIDOFF 2 63
## 5 PAIDOFF 1 123
## 6 PAIDOFF 2 177
ggplot(data = loan_6, aes(x = education, y = count, fill = loan_status)) + geom_bar(stat = "identity", position = "dodge")