# Keep only the columns used in the rest of this analysis.
loans <- select(
  loans_full_schema,
  loan_amount, term, interest_rate, grade,
  annual_income, homeownership, state, debt_to_income
)
# Frequency of each homeownership category.
table(loans[["homeownership"]])
##
## ANY MORTGAGE OWN RENT
## 0 0 4789 1353 3858
# There are 3 distinct values with observations (two other levels, including ANY, have a count of 0); MORTGAGE is the most common one
# Number of distinct interest rates in the data.
length(unique(loans[["interest_rate"]]))
## [1] 58
# Frequency of each individual interest rate.
table(loans[["interest_rate"]])
##
## 5.31 5.32 6 6.07 6.08 6.71 6.72 7.34 7.35 7.96 7.97 9.43 9.44
## 188 234 3 202 277 192 312 243 325 211 274 280 367
## 9.92 9.93 10.41 10.42 10.9 10.91 11.98 11.99 12.61 12.62 13.58 13.59 14.07
## 248 390 194 346 275 306 255 376 264 333 225 347 183
## 14.08 15.04 15.05 16.01 16.02 17.09 17.47 18.06 18.45 19.03 19.42 20 20.39
## 318 199 304 196 284 195 124 176 146 197 114 137 93
## 21.45 21.85 22.9 22.91 23.87 23.88 24.84 24.85 25.81 25.82 26.3 26.77 28.72
## 172 90 13 28 20 37 31 42 26 47 53 38 31
## 29.69 30.17 30.65 30.75 30.79 30.94
## 9 9 5 4 11 1
# There are 58 distinct values, and 9.93 is the most common one (390 loans; 11.99 is second with 376)
# Frequency table of annual income (one entry per distinct income value).
table(loans[["annual_income"]])
# The result is not very helpful: there are too many distinct values
# Histogram of loan amounts: 20 bins, with bin edges aligned to start at 0.
ggplot(loans, aes(x = loan_amount)) +
  geom_histogram(bins = 20, boundary = 0)

# Histogram of annual income using ggplot2's default binning.
ggplot(loans, aes(x = annual_income)) +
  geom_histogram()

# The range of values on the x-axis is so wide that the differences between them cannot be seen
# Density-scaled histogram of debt-to-income with a kernel density overlay.
# coord_cartesian() zooms the x-axis to 0-100 without discarding anything:
# xlim() sets scale limits instead, which drops any observations outside the
# range before the bin/density stats are computed and removes edge bars that
# extend past the limits. boundary = 0 puts the bin edges on 0, 2, 4, ...
# rather than centring a bin on zero.
ggplot(data = loans, mapping = aes(x = debt_to_income)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    binwidth = 2,
    boundary = 0
  ) +
  geom_density(linewidth = 1.2) +
  coord_cartesian(xlim = c(0, 100))

# The distribution is right-skewed: most people have a low debt-to-income ratio
# Scatter plot of debt-to-income against interest rate, coloured by grade.
ggplot(loans, aes(x = interest_rate, y = debt_to_income, color = grade)) +
  geom_point()

# Scatter plot of interest rate against loan amount, coloured by term.
ggplot(loans, aes(x = loan_amount, y = interest_rate, color = term)) +
  geom_point()
