install.packages("rpart")
install.packages("rpart.plot")
#loading in libraries
library(magrittr)
library(rpart)
library(rpart.plot)
Part 1 - Data Loading & Cleanup
set.seed(22)
#Read in all data
training_data = read.csv("/Users/yuyangwang 1/Desktop/OIDD 245/lab4-lending_club/LoanStats3c.csv", skip = 1)
test_data = read.csv("/Users/yuyangwang 1/Desktop/OIDD 245/lab4-lending_club/LoanStats3d.csv", skip = 1)
#Remove the last two rows
training_data = training_data[-c(nrow(training_data)), ]
training_data = training_data[-c(nrow(training_data)), ]
test_data = test_data[-c(nrow(test_data)), ]
test_data = test_data[-c(nrow(test_data)), ]
Part 2 - Descriptive Statistics
#if else to assign whether a row is highgrade or not
training_data$highgrade = ifelse(training_data$grade == "A" | training_data$grade == "B", 1, 0)
#find proportion of loans that are highgrade in training data
proportion = sum(training_data$highgrade) / nrow(training_data)
proportion
[1] 0.4160905
The proportion of loans that are considered highgrade is 0.4160905 whichis approximately 41.6%
Different t-tests
For all 3 factors (whether or not debtor’s income is above or below median income, whether or not debtor’s income is above or below median loan amount, and whether debtor rent their home or not), the p-values from the t-tests are less than 0.05, and therefore they are all statistically significant.
#above/below median income
training_data$aboveMedInc = ifelse(training_data$annual_inc > median(training_data$annual_inc), 1, 0)
t.test(training_data$aboveMedInc ~ training_data$highgrade)
Welch Two Sample t-test
data: training_data$aboveMedInc by training_data$highgrade
t = -45.062, df = 210228, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.09781757 -0.08966304
sample estimates:
mean in group 0 mean in group 1
0.4340485 0.5277888
#above/below median loan amount
training_data$aboveMedLoan = ifelse(training_data$loan_amnt > median(training_data$loan_amnt), 1, 0)
t.test(training_data$aboveMedLoan ~ training_data$highgrade)
Welch Two Sample t-test
data: training_data$aboveMedLoan by training_data$highgrade
t = 34.1, df = 211410, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.06697675 0.07514544
sample estimates:
mean in group 0 mean in group 1
0.5283604 0.4572993
#debtor rents home or not
training_data$hasRent = ifelse(training_data$home_ownership == "RENT", 1, 0)
t.test(training_data$hasRent ~ training_data$highgrade)
Welch Two Sample t-test
data: training_data$hasRent by training_data$highgrade
t = 14.687, df = 212877, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.02591320 0.03389428
sample estimates:
mean in group 0 mean in group 1
0.4057898 0.3758861
Part 3
Accuracy of this classifier on training data is: 0.6603021
Accuracy of classfier that assigns 0 to all rows: 0.5839059
Accuracy of classifier that assigns 0 or 1 randomly: 0.4160941
Therefore, the accuracy of thie classifier on the training data is more accurate than that of assigning 0 to all rows and assigning 0 or 1 randomly.
#glm to predict highgrade with home ownership, annual income, loan amount, verification status, and purpose
ml_model = glm(data = training_data, highgrade ~ home_ownership + annual_inc + loan_amnt + verification_status + purpose, family = binomial)
glm.fit: fitted probabilities numerically 0 or 1 occurred
#predict
training_data$prob = predict(ml_model, type="response")
training_data$prediction = ifelse(training_data$prob > 0.53, 1, 0)
mean(training_data$highgrade == training_data$prediction)
[1] 0.6603177
#all zeros
mean(training_data$highgrade == 0)
[1] 0.5839095
#randomly generate 0 or 1
mean(training_data$highgrade == (runif(nrow(training_data), 0, 1)< 1))
[1] 0.4160905
Part 4
Accuracy of this classifier on training data is: 0.6475828
Accuracy of classfier that assigns 0 to all rows: 0.5839059
Accuracy of classifier that assigns 0 or 1 randomly: 0.4160941
By comparing the accuracy of both models, the accuracy of the clasification model is slightly lower than that of the logistic model.
fit = rpart(highgrade ~ home_ownership + annual_inc + loan_amnt + verification_status + purpose, data = training_data, method="class")
rpart.plot(fit)

text(fit)
Error in rpartco(x) :
no information available on parameters from previous call to plot()
Part 5
For both the models, their predictions on the test data is slightly lower than their predictions on the training data; however that is expected. The accuracies are 0.6405417 and 0.6290051 for each model respectively. Therefore, the logistic model is slightly more accurate than the clasification model, though both are still higher than the accuracies when assigning 0 to all rows or assigining 0 or 1 randomly.
#removing row with education for purpose column
test_data = test_data[test_data$purpose != "educational",]
test_data$highgrade = ifelse(test_data$grade == "A" | test_data$grade == "B", 1, 0)
#glm predict on test data
test_data$prob = predict(ml_model, newdata=test_data, type="response")
test_data$prediction = ifelse(test_data$prob > 0.53, 1, 0)
mean(test_data$highgrade == test_data$prediction)
[1] 0.6405387
#tree predict on test data
y = predict(fit, test_data, type="class")
y = as.numeric(levels(y)[y])
mean(y == test_data$highgrade)
[1] 0.6290045
#all zeros
mean(test_data$highgrade == 0)
[1] 0.5465597
#randomly generate 0 or 1
mean(test_data$highgrade == (runif(nrow(test_data), 0, 1)< 1))
[1] 0.4534403
LS0tCnRpdGxlOiAiTGVuZGluZyBDbHViIEFuYWx5c2lzIC0gWXV5YW5nIFdhbmciCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KYGBge3J9Cmluc3RhbGwucGFja2FnZXMoInJwYXJ0IikKaW5zdGFsbC5wYWNrYWdlcygicnBhcnQucGxvdCIpCmBgYAoKCmBgYHtyfQojbG9hZGluZyBpbiBsaWJyYXJpZXMKbGlicmFyeShtYWdyaXR0cikKbGlicmFyeShycGFydCkKbGlicmFyeShycGFydC5wbG90KQpgYGAKCgpQYXJ0IDEgLSBEYXRhIExvYWRpbmcgJiBDbGVhbnVwCgpgYGB7cn0Kc2V0LnNlZWQoMjIpCmBgYAoKCmBgYHtyfQojUmVhZCBpbiBhbGwgZGF0YQp0cmFpbmluZ19kYXRhID0gcmVhZC5jc3YoIi9Vc2Vycy95dXlhbmd3YW5nIDEvRGVza3RvcC9PSUREIDI0NS9sYWI0LWxlbmRpbmdfY2x1Yi9Mb2FuU3RhdHMzYy5jc3YiLCBza2lwID0gMSkKdGVzdF9kYXRhID0gcmVhZC5jc3YoIi9Vc2Vycy95dXlhbmd3YW5nIDEvRGVza3RvcC9PSUREIDI0NS9sYWI0LWxlbmRpbmdfY2x1Yi9Mb2FuU3RhdHMzZC5jc3YiLCBza2lwID0gMSkKCiNSZW1vdmUgdGhlIGxhc3QgdHdvIHJvd3MKdHJhaW5pbmdfZGF0YSA9IHRyYWluaW5nX2RhdGFbLWMobnJvdyh0cmFpbmluZ19kYXRhKSksIF0KdHJhaW5pbmdfZGF0YSA9IHRyYWluaW5nX2RhdGFbLWMobnJvdyh0cmFpbmluZ19kYXRhKSksIF0KdGVzdF9kYXRhID0gdGVzdF9kYXRhWy1jKG5yb3codGVzdF9kYXRhKSksIF0KdGVzdF9kYXRhID0gdGVzdF9kYXRhWy1jKG5yb3codGVzdF9kYXRhKSksIF0KYGBgCgpQYXJ0IDIgLSBEZXNjcmlwdGl2ZSBTdGF0aXN0aWNzCgpgYGB7cn0KI2lmIGVsc2UgdG8gYXNzaWduIHdoZXRoZXIgYSByb3cgaXMgaGlnaGdyYWRlIG9yIG5vdAp0cmFpbmluZ19kYXRhJGhpZ2hncmFkZSA9IGlmZWxzZSh0cmFpbmluZ19kYXRhJGdyYWRlID09ICJBIiB8IHRyYWluaW5nX2RhdGEkZ3JhZGUgPT0gIkIiLCAxLCAwKQpgYGAKCmBgYHtyfQojZmluZCBwcm9wb3J0aW9uIG9mIGxvYW5zIHRoYXQgYXJlIGhpZ2hncmFkZSBpbiB0cmFpbmluZyBkYXRhCnByb3BvcnRpb24gPSBzdW0odHJhaW5pbmdfZGF0YSRoaWdoZ3JhZGUpIC8gbnJvdyh0cmFpbmluZ19kYXRhKQpwcm9wb3J0aW9uCmBgYAoKVGhlIHByb3BvcnRpb24gb2YgbG9hbnMgdGhhdCBhcmUgY29uc2lkZXJlZCBoaWdoZ3JhZGUgaXMgMC40MTYwOTA1IHdoaWNoaXMgYXBwcm94aW1hdGVseSA0MS42JQoKRGlmZmVyZW50IHQtdGVzdHMKCkZvciBhbGwgMyBmYWN0b3JzICh3aGV0aGVyIG9yIG5vdCBkZWJ0b3IncyBpbmNvbWUgaXMgYWJvdmUgb3IgYmVsb3cgbWVkaWFuIGluY29tZSwgd2hldGhlciBvciBub3QgZGVidG9yJ3MgaW5jb21lIGlzIGFib3ZlIG9yIGJlbG93IG1lZGlhbiBsb2FuIGFtb3VudCwgYW5kIHdoZXRoZXIgZGVidG9yIHJlbnQgdGhlaXIgaG9tZSBvciBub3QpLCB0aGUgcC12YWx1ZXMgZnJvbSB0aGUgdC10ZXN0cyBhcmUgbGVzcyB0aGFuIDAuMDUsIGFuZCB0aGVyZWZvcmUgdGhleSBhcmUgYWxsIHN0YXRpc3RpY2FsbHkgc2lnbmlmaWNhbnQuCgpgYGB7cn0KI2Fib3ZlL2JlbG93IG1lZGlhbiBpbmNvbWUKdHJhaW5pbmdfZGF0YSRhYm92ZU1lZEluYyA9IGlmZWxzZSh0cmFpbmluZ19kYXRhJGFubnVhbF9pbmMgPiBtZWRpYW4odHJhaW5pbmdfZGF0YSRhbm51YWxfaW5jKSwgMSwgMCkKdC50ZXN0KHRyYWluaW5nX2RhdGEkYWJvdmVNZWRJbmMgfiB0cmFpbmluZ19kYXRhJGhpZ2hncmFkZSkKCiNhYm92ZS9iZWxvdyBtZWRpYW4gbG9hbiBhbW91bnQKdHJhaW5pbmdfZGF0YSRhYm92ZU1lZExvYW4gPSBpZmVsc2UodHJhaW5pbmdfZGF0YSRsb2FuX2FtbnQgPiBtZWRpYW4odHJhaW5pbmdfZGF0YSRsb2FuX2FtbnQpLCAxLCAwKQp0LnRlc3QodHJhaW5pbmdfZGF0YSRhYm92ZU1lZExvYW4gfiB0cmFpbmluZ19kYXRhJGhpZ2hncmFkZSkKCiNkZWJ0b3IgcmVudHMgaG9tZSBvciBub3QKdHJhaW5pbmdfZGF0YSRoYXNSZW50ID0gaWZlbHNlKHRyYWluaW5nX2RhdGEkaG9tZV9vd25lcnNoaXAgPT0gIlJFTlQiLCAxLCAwKQp0LnRlc3QodHJhaW5pbmdfZGF0YSRoYXNSZW50IH4gdHJhaW5pbmdfZGF0YSRoaWdoZ3JhZGUpCmBgYAoKUGFydCAzCgpBY2N1cmFjeSBvZiB0aGlzIGNsYXNzaWZpZXIgb24gdHJhaW5pbmcgZGF0YSBpczogMC42NjAzMDIxCgpBY2N1cmFjeSBvZiBjbGFzc2ZpZXIgdGhhdCBhc3NpZ25zIDAgdG8gYWxsIHJvd3M6IDAuNTgzOTA1OQoKQWNjdXJhY3kgb2YgY2xhc3NpZmllciB0aGF0IGFzc2lnbnMgMCBvciAxIHJhbmRvbWx5OiAwLjQxNjA5NDEKClRoZXJlZm9yZSwgdGhlIGFjY3VyYWN5IG9mIHRoaWUgY2xhc3NpZmllciBvbiB0aGUgdHJhaW5pbmcgZGF0YSBpcyBtb3JlIGFjY3VyYXRlIHRoYW4gdGhhdCBvZiBhc3NpZ25pbmcgMCB0byBhbGwgcm93cyBhbmQgYXNzaWduaW5nIDAgb3IgMSByYW5kb21seS4KCmBgYHtyfQojZ2xtIHRvIHByZWRpY3QgaGlnaGdyYWRlIHdpdGggaG9tZSBvd25lcnNoaXAsIGFubnVhbCBpbmNvbWUsIGxvYW4gYW1vdW50LCB2ZXJpZmljYXRpb24gc3RhdHVzLCBhbmQgcHVycG9zZQptbF9tb2RlbCA9IGdsbShkYXRhID0gdHJhaW5pbmdfZGF0YSwgaGlnaGdyYWRlIH4gaG9tZV9vd25lcnNoaXAgKyBhbm51YWxfaW5jICsgbG9hbl9hbW50ICsgdmVyaWZpY2F0aW9uX3N0YXR1cyArIHB1cnBvc2UsIGZhbWlseSA9IGJpbm9taWFsKQoKI3ByZWRpY3QKdHJhaW5pbmdfZGF0YSRwcm9iID0gcHJlZGljdChtbF9tb2RlbCwgdHlwZT0icmVzcG9uc2UiKQp0cmFpbmluZ19kYXRhJHByZWRpY3Rpb24gPSBpZmVsc2UodHJhaW5pbmdfZGF0YSRwcm9iID4gMC41MywgMSwgMCkKbWVhbih0cmFpbmluZ19kYXRhJGhpZ2hncmFkZSA9PSB0cmFpbmluZ19kYXRhJHByZWRpY3Rpb24pCgojYWxsIHplcm9zCm1lYW4odHJhaW5pbmdfZGF0YSRoaWdoZ3JhZGUgPT0gMCkKCiNyYW5kb21seSBnZW5lcmF0ZSAwIG9yIDEKbWVhbih0cmFpbmluZ19kYXRhJGhpZ2hncmFkZSA9PSAocnVuaWYobnJvdyh0cmFpbmluZ19kYXRhKSwgMCwgMSk8IDEpKQpgYGAKClBhcnQgNAoKQWNjdXJhY3kgb2YgdGhpcyBjbGFzc2lmaWVyIG9uIHRyYWluaW5nIGRhdGEgaXM6IDAuNjQ3NTgyOAoKQWNjdXJhY3kgb2YgY2xhc3NmaWVyIHRoYXQgYXNzaWducyAwIHRvIGFsbCByb3dzOiAwLjU4MzkwNTkKCkFjY3VyYWN5IG9mIGNsYXNzaWZpZXIgdGhhdCBhc3NpZ25zIDAgb3IgMSByYW5kb21seTogMC40MTYwOTQxCgpCeSBjb21wYXJpbmcgdGhlIGFjY3VyYWN5IG9mIGJvdGggbW9kZWxzLCB0aGUgYWNjdXJhY3kgb2YgdGhlIGNsYXNpZmljYXRpb24gbW9kZWwgaXMgc2xpZ2h0bHkgbG93ZXIgdGhhbiB0aGF0IG9mIHRoZSBsb2dpc3RpYyBtb2RlbC4KCmBgYHtyfQpmaXQgPSBycGFydChoaWdoZ3JhZGUgfiBob21lX293bmVyc2hpcCArIGFubnVhbF9pbmMgKyBsb2FuX2FtbnQgKyB2ZXJpZmljYXRpb25fc3RhdHVzICsgcHVycG9zZSwgZGF0YSA9IHRyYWluaW5nX2RhdGEsIG1ldGhvZD0iY2xhc3MiKQoKcnBhcnQucGxvdChmaXQpCnRleHQoZml0KQoKI3ByZWRpY3Rpb24gb24gdHJhaW5pbmcgZGF0YQp6ID0gcHJlZGljdChmaXQsIHRyYWluaW5nX2RhdGEsIHR5cGU9ImNsYXNzIikKeiA9IGFzLm51bWVyaWMobGV2ZWxzKHopW3pdKQojY2JpbmQocHJlZGljdCA9IHosIGFjdHVhbF9ncmFkZSA9IHRyYWluaW5nX2RhdGEkaGlnaGdyYWRlKVssXQptZWFuKHogPT0gdHJhaW5pbmdfZGF0YSRoaWdoZ3JhZGUpCgojYWxsIHplcm9zCm1lYW4odHJhaW5pbmdfZGF0YSRoaWdoZ3JhZGUgPT0gMCkKCiNyYW5kb21seSBnZW5lcmF0ZSAwIG9yIDEKbWVhbih0cmFpbmluZ19kYXRhJGhpZ2hncmFkZSA9PSAocnVuaWYobnJvdyh0cmFpbmluZ19kYXRhKSwgMCwgMSk8IDEpKQpgYGAKClBhcnQgNQoKRm9yIGJvdGggdGhlIG1vZGVscywgdGhlaXIgcHJlZGljdGlvbnMgb24gdGhlIHRlc3QgZGF0YSBpcyBzbGlnaHRseSBsb3dlciB0aGFuIHRoZWlyIHByZWRpY3Rpb25zIG9uIHRoZSB0cmFpbmluZyBkYXRhOyBob3dldmVyIHRoYXQgaXMgZXhwZWN0ZWQuIFRoZSBhY2N1cmFjaWVzIGFyZSAwLjY0MDU0MTcgYW5kIDAuNjI5MDA1MSBmb3IgZWFjaCBtb2RlbCByZXNwZWN0aXZlbHkuIFRoZXJlZm9yZSwgdGhlIGxvZ2lzdGljIG1vZGVsIGlzIHNsaWdodGx5IG1vcmUgYWNjdXJhdGUgdGhhbiB0aGUgY2xhc2lmaWNhdGlvbiBtb2RlbCwgdGhvdWdoIGJvdGggYXJlIHN0aWxsIGhpZ2hlciB0aGFuIHRoZSBhY2N1cmFjaWVzIHdoZW4gYXNzaWduaW5nIDAgdG8gYWxsIHJvd3Mgb3IgYXNzaWdpbmluZyAwIG9yIDEgcmFuZG9tbHkuCgpgYGB7cn0KI3JlbW92aW5nIHJvdyB3aXRoIGVkdWNhdGlvbiBmb3IgcHVycG9zZSBjb2x1bW4gCnRlc3RfZGF0YSA9IHRlc3RfZGF0YVt0ZXN0X2RhdGEkcHVycG9zZSAhPSAiZWR1Y2F0aW9uYWwiLF0KdGVzdF9kYXRhJGhpZ2hncmFkZSA9IGlmZWxzZSh0ZXN0X2RhdGEkZ3JhZGUgPT0gIkEiIHwgdGVzdF9kYXRhJGdyYWRlID09ICJCIiwgMSwgMCkKCiNnbG0gcHJlZGljdCBvbiB0ZXN0IGRhdGEKdGVzdF9kYXRhJHByb2IgPSBwcmVkaWN0KG1sX21vZGVsLCBuZXdkYXRhPXRlc3RfZGF0YSwgdHlwZT0icmVzcG9uc2UiKQp0ZXN0X2RhdGEkcHJlZGljdGlvbiA9IGlmZWxzZSh0ZXN0X2RhdGEkcHJvYiA+IDAuNTMsIDEsIDApCm1lYW4odGVzdF9kYXRhJGhpZ2hncmFkZSA9PSB0ZXN0X2RhdGEkcHJlZGljdGlvbikKCiN0cmVlIHByZWRpY3Qgb24gdGVzdCBkYXRhCnkgPSBwcmVkaWN0KGZpdCwgdGVzdF9kYXRhLCB0eXBlPSJjbGFzcyIpCnkgPSBhcy5udW1lcmljKGxldmVscyh5KVt5XSkKbWVhbih5ID09IHRlc3RfZGF0YSRoaWdoZ3JhZGUpCgojYWxsIHplcm9zCm1lYW4odGVzdF9kYXRhJGhpZ2hncmFkZSA9PSAwKQoKI3JhbmRvbWx5IGdlbmVyYXRlIDAgb3IgMQptZWFuKHRlc3RfZGF0YSRoaWdoZ3JhZGUgPT0gKHJ1bmlmKG5yb3codGVzdF9kYXRhKSwgMCwgMSk8IDEpKQpgYGAKCg==