glimpse(loans_full_schema)
## Rows: 10,000
## Columns: 55
## $ emp_title <chr> "global config engineer ", "warehouse…
## $ emp_length <dbl> 3, 10, 3, 1, 10, NA, 10, 10, 10, 3, 1…
## $ state <fct> NJ, HI, WI, PA, CA, KY, MI, AZ, NV, I…
## $ homeownership <fct> MORTGAGE, RENT, RENT, RENT, RENT, OWN…
## $ annual_income <dbl> 90000, 40000, 40000, 30000, 35000, 34…
## $ verified_income <fct> Verified, Not Verified, Source Verifi…
## $ debt_to_income <dbl> 18.01, 5.04, 21.15, 10.16, 57.96, 6.4…
## $ annual_income_joint <dbl> NA, NA, NA, NA, 57000, NA, 155000, NA…
## $ verification_income_joint <fct> , , , , Verified, , Not Verified, , ,…
## $ debt_to_income_joint <dbl> NA, NA, NA, NA, 37.66, NA, 13.12, NA,…
## $ delinq_2y <int> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0…
## $ months_since_last_delinq <int> 38, NA, 28, NA, NA, 3, NA, 19, 18, NA…
## $ earliest_credit_line <dbl> 2001, 1996, 2006, 2007, 2008, 1990, 2…
## $ inquiries_last_12m <int> 6, 1, 4, 0, 7, 6, 1, 1, 3, 0, 4, 4, 8…
## $ total_credit_lines <int> 28, 30, 31, 4, 22, 32, 12, 30, 35, 9,…
## $ open_credit_lines <int> 10, 14, 10, 4, 16, 12, 10, 15, 21, 6,…
## $ total_credit_limit <int> 70795, 28800, 24193, 25400, 69839, 42…
## $ total_credit_utilized <int> 38767, 4321, 16000, 4997, 52722, 3898…
## $ num_collections_last_12m <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ num_historical_failed_to_pay <int> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ months_since_90d_late <int> 38, NA, 28, NA, NA, 60, NA, 71, 18, N…
## $ current_accounts_delinq <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ total_collection_amount_ever <int> 1250, 0, 432, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ current_installment_accounts <int> 2, 0, 1, 1, 1, 0, 2, 2, 6, 1, 2, 1, 2…
## $ accounts_opened_24m <int> 5, 11, 13, 1, 6, 2, 1, 4, 10, 5, 6, 7…
## $ months_since_last_credit_inquiry <int> 5, 8, 7, 15, 4, 5, 9, 7, 4, 17, 3, 4,…
## $ num_satisfactory_accounts <int> 10, 14, 10, 4, 16, 12, 10, 15, 21, 6,…
## $ num_accounts_120d_past_due <int> 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, …
## $ num_accounts_30d_past_due <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ num_active_debit_accounts <int> 2, 3, 3, 2, 10, 1, 3, 5, 11, 3, 2, 2,…
## $ total_debit_limit <int> 11100, 16500, 4300, 19400, 32700, 272…
## $ num_total_cc_accounts <int> 14, 24, 14, 3, 20, 27, 8, 16, 19, 7, …
## $ num_open_cc_accounts <int> 8, 14, 8, 3, 15, 12, 7, 12, 14, 5, 8,…
## $ num_cc_carrying_balance <int> 6, 4, 6, 2, 13, 5, 6, 10, 14, 3, 5, 3…
## $ num_mort_accounts <int> 1, 0, 0, 0, 0, 3, 2, 7, 2, 0, 2, 3, 3…
## $ account_never_delinq_percent <dbl> 92.9, 100.0, 93.5, 100.0, 100.0, 78.1…
## $ tax_liens <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ public_record_bankrupt <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ loan_purpose <fct> moving, debt_consolidation, other, de…
## $ application_type <fct> individual, individual, individual, i…
## $ loan_amount <int> 28000, 5000, 2000, 21600, 23000, 5000…
## $ term <dbl> 60, 36, 36, 36, 36, 36, 60, 60, 36, 3…
## $ interest_rate <dbl> 14.07, 12.61, 17.09, 6.72, 14.07, 6.7…
## $ installment <dbl> 652.53, 167.54, 71.40, 664.19, 786.87…
## $ grade <fct> C, C, D, A, C, A, C, B, C, A, C, B, C…
## $ sub_grade <fct> C3, C1, D1, A3, C3, A3, C2, B5, C2, A…
## $ issue_month <fct> Mar-2018, Feb-2018, Feb-2018, Jan-201…
## $ loan_status <fct> Current, Current, Current, Current, C…
## $ initial_listing_status <fct> whole, whole, fractional, whole, whol…
## $ disbursement_method <fct> Cash, Cash, Cash, Cash, Cash, Cash, C…
## $ balance <dbl> 27015.86, 4651.37, 1824.63, 18853.26,…
## $ paid_total <dbl> 1999.330, 499.120, 281.800, 3312.890,…
## $ paid_principal <dbl> 984.14, 348.63, 175.37, 2746.74, 1569…
## $ paid_interest <dbl> 1015.19, 150.49, 106.43, 566.15, 754.…
## $ paid_late_fees <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
map(loans_full_schema, ~sum(is.na(.)))
## $emp_title
## [1] 0
##
## $emp_length
## [1] 817
##
## $state
## [1] 0
##
## $homeownership
## [1] 0
##
## $annual_income
## [1] 0
##
## $verified_income
## [1] 0
##
## $debt_to_income
## [1] 24
##
## $annual_income_joint
## [1] 8505
##
## $verification_income_joint
## [1] 0
##
## $debt_to_income_joint
## [1] 8505
##
## $delinq_2y
## [1] 0
##
## $months_since_last_delinq
## [1] 5658
##
## $earliest_credit_line
## [1] 0
##
## $inquiries_last_12m
## [1] 0
##
## $total_credit_lines
## [1] 0
##
## $open_credit_lines
## [1] 0
##
## $total_credit_limit
## [1] 0
##
## $total_credit_utilized
## [1] 0
##
## $num_collections_last_12m
## [1] 0
##
## $num_historical_failed_to_pay
## [1] 0
##
## $months_since_90d_late
## [1] 7715
##
## $current_accounts_delinq
## [1] 0
##
## $total_collection_amount_ever
## [1] 0
##
## $current_installment_accounts
## [1] 0
##
## $accounts_opened_24m
## [1] 0
##
## $months_since_last_credit_inquiry
## [1] 1271
##
## $num_satisfactory_accounts
## [1] 0
##
## $num_accounts_120d_past_due
## [1] 318
##
## $num_accounts_30d_past_due
## [1] 0
##
## $num_active_debit_accounts
## [1] 0
##
## $total_debit_limit
## [1] 0
##
## $num_total_cc_accounts
## [1] 0
##
## $num_open_cc_accounts
## [1] 0
##
## $num_cc_carrying_balance
## [1] 0
##
## $num_mort_accounts
## [1] 0
##
## $account_never_delinq_percent
## [1] 0
##
## $tax_liens
## [1] 0
##
## $public_record_bankrupt
## [1] 0
##
## $loan_purpose
## [1] 0
##
## $application_type
## [1] 0
##
## $loan_amount
## [1] 0
##
## $term
## [1] 0
##
## $interest_rate
## [1] 0
##
## $installment
## [1] 0
##
## $grade
## [1] 0
##
## $sub_grade
## [1] 0
##
## $issue_month
## [1] 0
##
## $loan_status
## [1] 0
##
## $initial_listing_status
## [1] 0
##
## $disbursement_method
## [1] 0
##
## $balance
## [1] 0
##
## $paid_total
## [1] 0
##
## $paid_principal
## [1] 0
##
## $paid_interest
## [1] 0
##
## $paid_late_fees
## [1] 0
Summary <- loans_full_schema %>%
select(grade, sub_grade, annual_income) %>%
group_by(grade, sub_grade) %>%
arrange(grade, sub_grade)
show(Summary)
## # A tibble: 10,000 × 3
## # Groups: grade, sub_grade [32]
## grade sub_grade annual_income
## <fct> <fct> <dbl>
## 1 A A1 60000
## 2 A A1 188000
## 3 A A1 69000
## 4 A A1 126000
## 5 A A1 95000
## 6 A A1 75000
## 7 A A1 89000
## 8 A A1 150000
## 9 A A1 95000
## 10 A A1 90000
## # ℹ 9,990 more rows
ggplot(Summary) +
stat_summary(aes(grade, annual_income ,fill = sub_grade), fun = 'mean', geom = 'bar', position = 'dodge') +
labs(title = "The Relationship Of Customers' Grade And Annual Income",
x = "Grade Level",
y = "Average Annual Income (US Dollar)") +
scale_y_continuous(labels = scales::dollar) +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.3)))
Summary_2 <- loans_full_schema %>%
select(grade, sub_grade, annual_income, verified_income) %>%
filter(grade == c("F", "G")) %>%
group_by(grade, sub_grade) %>%
arrange(grade, sub_grade)
show(Summary_2)
## # A tibble: 37 × 4
## # Groups: grade, sub_grade [6]
## grade sub_grade annual_income verified_income
## <fct> <fct> <dbl> <fct>
## 1 F F1 64000 Source Verified
## 2 F F1 50840 Source Verified
## 3 F F1 190000 Source Verified
## 4 F F1 68274 Source Verified
## 5 F F1 48000 Source Verified
## 6 F F1 65000 Source Verified
## 7 F F1 88000 Verified
## 8 F F1 38000 Not Verified
## 9 F F1 65000 Not Verified
## 10 F F1 58000 Verified
## # ℹ 27 more rows
ggplot(Summary_2) +
geom_bar(aes(verified_income, fill = grade), position = "dodge") +
labs(title = "Information of Income verification",
x = "Verification State",
y = "Counts") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))
Comments: It seems like the verfication State is not the main reason that the abnormal plot of Grade F and G. However, the total data count is pretty small.
Summary_2_1 <- loans_full_schema %>%
select(grade, sub_grade, annual_income, verified_income) %>%
group_by(grade, sub_grade) %>%
arrange(grade, sub_grade)
show(Summary_2_1)
## # A tibble: 10,000 × 4
## # Groups: grade, sub_grade [32]
## grade sub_grade annual_income verified_income
## <fct> <fct> <dbl> <fct>
## 1 A A1 60000 Not Verified
## 2 A A1 188000 Not Verified
## 3 A A1 69000 Not Verified
## 4 A A1 126000 Source Verified
## 5 A A1 95000 Not Verified
## 6 A A1 75000 Not Verified
## 7 A A1 89000 Not Verified
## 8 A A1 150000 Source Verified
## 9 A A1 95000 Not Verified
## 10 A A1 90000 Not Verified
## # ℹ 9,990 more rows
ggplot(Summary_2_1) +
geom_bar(aes(verified_income, fill = grade), position = "dodge") +
labs(title = "Information of Income verification",
x = "Verification State",
y = "Counts") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))
Comments: Now it is clear that F and G have pretty small data compare to other grade. It is possible just because the lack of data that lead to the result.
Data <- loans_full_schema %>%
select(verified_income, interest_rate)
show(Data)
## # A tibble: 10,000 × 2
## verified_income interest_rate
## <fct> <dbl>
## 1 Verified 14.1
## 2 Not Verified 12.6
## 3 Source Verified 17.1
## 4 Not Verified 6.72
## 5 Verified 14.1
## 6 Not Verified 6.72
## 7 Source Verified 13.6
## 8 Source Verified 12.0
## 9 Source Verified 13.6
## 10 Not Verified 6.71
## # ℹ 9,990 more rows
ggplot(Data) +
stat_summary(aes(verified_income, interest_rate, fill = verified_income), fun = 'mean', geom = 'bar') +
labs(title = "The Relationship between Customers' Loan interest rate And Annual Income Verification State",
x = "Verification State",
y = "Average Interest Rate") +
scale_y_continuous(labels = scales::percent) +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.3)))
Comments: It is interesting to see that actually the average of loan interest rate of customers who has Verified annual income higher than people who has not being verified. Is it because the lack of data on Not Verified?
ggplot(Data) +
geom_bar(aes(verified_income, fill = verified_income)) +
labs(title = "Counts Of Customer Verification State",
x = "Verification State",
y = "Count") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.3)))
Comments: Data is not the case.