This is a programming assignment.
Use the first half of this dataset to be the training set and the second half of this dataset to be the test set. Your task is to predict the credit rating of an individual.
Please ensure that the data are clean.
## rating experience homeown loandurn age mstat rcds jtype explvl
## 1 0 9 rent 60 30 married no_rec freelance 73
## 2 0 17 rent 60 58 widow no_rec fixed 48
## 3 1 10 owner 36 46 married yes_rec freelance 90
## 4 0 0 rent 60 24 single no_rec fixed 63
## 5 0 0 rent 36 26 single no_rec fixed 46
## 6 0 1 owner 60 36 married no_rec fixed 75
## inc assts debt loanamount purchprice
## 1 129 0 0 800 846
## 2 131 0 0 1000 1658
## 3 200 3000 0 2000 2985
## 4 182 2500 0 900 1325
## 5 107 0 0 310 910
## 6 214 3500 0 650 1645
## 'data.frame': 2223 obs. of 14 variables:
## $ rating : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 2 ...
## $ experience: int 9 17 10 0 0 1 29 9 0 0 ...
## $ homeown : Factor w/ 6 levels "ignore","other",..: 6 6 3 6 6 3 3 4 3 4 ...
## $ loandurn : int 60 60 36 60 36 60 60 12 60 48 ...
## $ age : int 30 58 46 24 26 36 44 27 32 41 ...
## $ mstat : Factor w/ 5 levels "divorced","married",..: 2 5 2 4 4 2 2 4 2 2 ...
## $ rcds : Factor w/ 2 levels "no_rec","yes_rec": 1 1 2 1 1 1 1 1 1 1 ...
## $ jtype : Factor w/ 4 levels "fixed","freelance",..: 2 1 2 1 1 1 1 1 2 4 ...
## $ explvl : int 73 48 90 63 46 75 75 35 90 90 ...
## $ inc : int 129 131 200 182 107 214 125 80 107 80 ...
## $ assts : int 0 0 3000 2500 0 3500 10000 0 15000 0 ...
## $ debt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ loanamount: int 800 1000 2000 900 310 650 1600 200 1200 1200 ...
## $ purchprice: int 846 1658 2985 1325 910 1645 1800 1093 1957 1468 ...
## [1] 0
## rating experience homeown loandurn age mstat
## 0 0 0 0 0 0
## rcds jtype explvl inc assts debt
## 0 0 0 0 0 0
## loanamount purchprice
## 0 0
library(ggplot2)
table(mydf$rating)
##
## 0 1
## 1595 628
table(mydf$homeown)
##
## ignore other owner parents priv rent
## 9 160 1072 362 104 516
g <- ggplot(data = mydf, aes(homeown, fill = homeown)) +
geom_bar() +
scale_fill_discrete(name="home ownership")
g
table(mydf$mstat)
##
## divorced married separated single widow
## 20 1647 59 463 34
g <- ggplot(data = mydf, aes(mstat, fill = mstat)) +
geom_bar() +
scale_fill_discrete(name="Marital status")
g
table(mydf$rcds)
##
## no_rec yes_rec
## 1896 327
g <- ggplot(data = mydf, aes(rcds, fill = rcds)) +
geom_bar() +
scale_fill_discrete(name="Existence of records")
g
table(mydf$jtype)
##
## fixed freelance others partime
## 1413 535 72 203
g <- ggplot(data = mydf, aes(jtype, fill = jtype)) +
geom_bar() +
scale_fill_discrete(name="Job type")
g
summary(mydf$experience)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 5.000 7.853 12.000 43.000
g <- ggplot(data = mydf, aes(experience, fill = experience)) +
geom_histogram()
g
summary(mydf$loandurn)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 36.00 48.00 46.36 60.00 60.00
g <- ggplot(data = mydf, aes(loandurn, fill = loandurn)) +
geom_histogram(breaks = seq(0, 60, 5))
g
summary(mydf$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 28.00 35.00 37.07 45.00 68.00
g <- ggplot(data = mydf, aes(age, fill = age)) +
geom_histogram(breaks = seq(15, 70 ,5))
g
summary(mydf$explvl)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35.00 45.00 60.00 60.91 75.00 173.00
g <- ggplot(data = mydf, aes(explvl, fill = explvl)) +
geom_histogram()
g
summary(mydf$inc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 93.0 130.0 148.6 180.0 959.0
g <- ggplot(data = mydf, aes(inc, fill = inc)) +
geom_histogram()
g
summary(mydf$assts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 3000 5467 6000 300000
g <- ggplot(data = mydf, aes(assts, fill = assts)) +
geom_histogram()
g
summary(mydf$debt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 353.8 0.0 30000.0
g <- ggplot(data = mydf, aes(debt, fill = debt)) +
geom_histogram()
g
summary(mydf$loanamount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100 700 1000 1028 1300 5000
g <- ggplot(data = mydf, aes(loanamount, fill = loanamount)) +
geom_histogram()
g
summary(mydf$purchprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 105 1111 1386 1440 1683 8800
g <- ggplot(data = mydf, aes(purchprice, fill = purchprice)) +
geom_histogram()
g
## records
## rating no_rec yes_rec
## 0 1462 133
## 1 434 194
## jtype
## rating fixed freelance others partime
## 0 1104 362 39 90
## 1 309 173 33 113
## homeown
## rating ignore other owner parents priv rent
## 0 5 74 876 254 69 317
## 1 4 86 196 108 35 199
library(dplyr)
str(mydf)
## 'data.frame': 2223 obs. of 14 variables:
## $ rating : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 2 ...
## $ experience: int 9 17 10 0 0 1 29 9 0 0 ...
## $ homeown : Factor w/ 6 levels "ignore","other",..: 6 6 3 6 6 3 3 4 3 4 ...
## $ loandurn : int 60 60 36 60 36 60 60 12 60 48 ...
## $ age : int 30 58 46 24 26 36 44 27 32 41 ...
## $ mstat : Factor w/ 5 levels "divorced","married",..: 2 5 2 4 4 2 2 4 2 2 ...
## $ rcds : Factor w/ 2 levels "no_rec","yes_rec": 1 1 2 1 1 1 1 1 1 1 ...
## $ jtype : Factor w/ 4 levels "fixed","freelance",..: 2 1 2 1 1 1 1 1 2 4 ...
## $ explvl : int 73 48 90 63 46 75 75 35 90 90 ...
## $ inc : int 129 131 200 182 107 214 125 80 107 80 ...
## $ assts : int 0 0 3000 2500 0 3500 10000 0 15000 0 ...
## $ debt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ loanamount: int 800 1000 2000 900 310 650 1600 200 1200 1200 ...
## $ purchprice: int 846 1658 2985 1325 910 1645 1800 1093 1957 1468 ...
# Logistic Model
model.logistic <- glm(formula = rating ~ ., family =binomial(link='logit'), data = mydf)
summary(model.logistic)
##
## Call:
## glm(formula = rating ~ ., family = binomial(link = "logit"),
## data = mydf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5219 -0.6973 -0.3956 0.6031 3.1915
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.738e-01 9.990e-01 0.274 0.784032
## experience -7.646e-02 1.048e-02 -7.293 3.03e-13 ***
## homeownother -3.322e-01 7.807e-01 -0.425 0.670501
## homeownowner -1.733e+00 7.607e-01 -2.278 0.022701 *
## homeownparents -1.404e+00 7.730e-01 -1.816 0.069310 .
## homeownpriv -1.019e+00 7.911e-01 -1.288 0.197857
## homeownrent -9.726e-01 7.646e-01 -1.272 0.203349
## loandurn 5.372e-03 4.882e-03 1.100 0.271222
## age 1.172e-02 6.999e-03 1.675 0.094032 .
## mstatmarried -1.474e+00 5.415e-01 -2.722 0.006481 **
## mstatseparated -1.728e-01 6.154e-01 -0.281 0.778852
## mstatsingle -9.441e-01 5.443e-01 -1.735 0.082814 .
## mstatwidow -1.976e+00 8.173e-01 -2.417 0.015639 *
## rcdsyes_rec 1.874e+00 1.539e-01 12.176 < 2e-16 ***
## jtypefreelance 8.816e-01 1.429e-01 6.168 6.93e-10 ***
## jtypeothers 1.113e+00 3.032e-01 3.671 0.000241 ***
## jtypepartime 1.275e+00 1.813e-01 7.034 2.00e-12 ***
## explvl 1.271e-02 3.813e-03 3.334 0.000855 ***
## inc -6.200e-03 8.841e-04 -7.012 2.34e-12 ***
## assts -2.638e-05 9.123e-06 -2.891 0.003834 **
## debt 1.920e-04 4.640e-05 4.138 3.50e-05 ***
## loanamount 1.975e-03 2.468e-04 8.002 1.23e-15 ***
## purchprice -1.022e-03 1.869e-04 -5.467 4.57e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2646.7 on 2222 degrees of freedom
## Residual deviance: 1954.8 on 2200 degrees of freedom
## AIC: 2000.8
##
## Number of Fisher Scoring iterations: 5
# Read test data
mydf.test <- test
mydf.test$rating <- ifelse(mydf.test$rating == "good",0,1)
model.logistic.fitted <- predict(model.logistic,mydf.test,type='response')
# How good our model is
model.logistic.fitted <- ifelse(model.logistic.fitted > 0.5,1,0)
misClassificationError <- mean(model.logistic.fitted != mydf.test$rating)
print(paste('Accuracy',round(1-misClassificationError,digits = 3)))
## [1] "Accuracy 0.792"
library(ROCR)
p <- predict(model.logistic, mydf.test, type="response")
pr <- prediction(p, mydf.test$rating)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
abline(a=0, b= 1)
auc1 <- performance(pr, measure = "auc")
auc1 <- auc1@y.values[[1]]
auc1
## [1] 0.8305952
Which variables in the logistic regression model are significant or important?
Please make a predictive model using a decision tree
library(rpart)
library(rpart.plot)
model.tree <- rpart(rating ~ ., mydf, method = "class")
rpart.plot(model.tree, type=1, extra = 102)
probs <- predict(model.tree, mydf.test, type = "prob")[,2]
library(ROCR)
pred <- prediction(probs, mydf.test$rating)
perf <- performance(pred, "tpr" , "fpr")
plot(perf)
abline(a=0, b= 1)
auc2 <- performance(pred, measure = "auc")
auc2 <- auc2@y.values[[1]]
auc2
## [1] 0.7518058
paste("Logistic:", round(auc1,digits = 3), sep = " ")
## [1] "Logistic: 0.831"
plot(prf)
abline(a=0, b= 1)
paste("Decision Tree:", round(auc2,digits = 3), sep = " ")
## [1] "Decision Tree: 0.752"
plot(perf)
abline(a=0, b= 1)
Find a way to explain sensitivity, specificity and auc in the ROC to your client (in less than 250 words).
Provide a brief write-up of your findings. How will you present these findings to your clients in less than 250 words?