This is a programming assignment.
mydfall <- read.csv("creditdata.csv")
mydf <- mydfall[c(1:2223), ]
test <- mydfall[c(2224: 4446), ]
mydf$rating <- ifelse(mydf$rating == "good",0,1)
head(mydf)
## rating experience homeown loandurn age mstat rcds jtype explvl
## 1 0 9 rent 60 30 married no_rec freelance 73
## 2 0 17 rent 60 58 widow no_rec fixed 48
## 3 1 10 owner 36 46 married yes_rec freelance 90
## 4 0 0 rent 60 24 single no_rec fixed 63
## 5 0 0 rent 36 26 single no_rec fixed 46
## 6 0 1 owner 60 36 married no_rec fixed 75
## inc assts debt loanamount purchprice
## 1 129 0 0 800 846
## 2 131 0 0 1000 1658
## 3 200 3000 0 2000 2985
## 4 182 2500 0 900 1325
## 5 107 0 0 310 910
## 6 214 3500 0 650 1645
mydf$rating <- as.factor(mydf$rating)
str(mydf)
## 'data.frame': 2223 obs. of 14 variables:
## $ rating : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 2 ...
## $ experience: int 9 17 10 0 0 1 29 9 0 0 ...
## $ homeown : Factor w/ 6 levels "ignore","other",..: 6 6 3 6 6 3 3 4 3 4 ...
## $ loandurn : int 60 60 36 60 36 60 60 12 60 48 ...
## $ age : int 30 58 46 24 26 36 44 27 32 41 ...
## $ mstat : Factor w/ 5 levels "divorced","married",..: 2 5 2 4 4 2 2 4 2 2 ...
## $ rcds : Factor w/ 2 levels "no_rec","yes_rec": 1 1 2 1 1 1 1 1 1 1 ...
## $ jtype : Factor w/ 4 levels "fixed","freelance",..: 2 1 2 1 1 1 1 1 2 4 ...
## $ explvl : int 73 48 90 63 46 75 75 35 90 90 ...
## $ inc : int 129 131 200 182 107 214 125 80 107 80 ...
## $ assts : int 0 0 3000 2500 0 3500 10000 0 15000 0 ...
## $ debt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ loanamount: int 800 1000 2000 900 310 650 1600 200 1200 1200 ...
## $ purchprice: int 846 1658 2985 1325 910 1645 1800 1093 1957 1468 ...
## [1] 0
## rating experience homeown loandurn age mstat
## 0 0 0 0 0 0
## rcds jtype explvl inc assts debt
## 0 0 0 0 0 0
## loanamount purchprice
## 0 0
library(ggplot2)
table(mydf$rating)
##
## 0 1
## 1595 628
table(mydf$homeown)
##
## ignore other owner parents priv rent
## 9 160 1072 362 104 516
g <- ggplot(data = mydf, aes(homeown, fill = homeown)) +
geom_bar() +
scale_fill_discrete(name="home ownership")
g
table(mydf$mstat)
##
## divorced married separated single widow
## 20 1647 59 463 34
g <- ggplot(data = mydf, aes(mstat, fill = mstat)) +
geom_bar() +
scale_fill_discrete(name="Marital status")
g
table(mydf$rcds)
##
## no_rec yes_rec
## 1896 327
g <- ggplot(data = mydf, aes(rcds, fill = rcds)) +
geom_bar() +
scale_fill_discrete(name="Existence of records")
g
table(mydf$jtype)
##
## fixed freelance others partime
## 1413 535 72 203
g <- ggplot(data = mydf, aes(jtype, fill = jtype)) +
geom_bar() +
scale_fill_discrete(name="Job type")
g
summary(mydf$experience)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 5.000 7.853 12.000 43.000
g <- ggplot(data = mydf, aes(experience, fill = experience)) +
geom_histogram()
g
summary(mydf$loandurn)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 36.00 48.00 46.36 60.00 60.00
g <- ggplot(data = mydf, aes(loandurn, fill = loandurn)) +
geom_histogram(breaks = seq(0, 60, 5))
g
summary(mydf$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 28.00 35.00 37.07 45.00 68.00
g <- ggplot(data = mydf, aes(age, fill = age)) +
geom_histogram(breaks = seq(15, 70 ,5))
g
summary(mydf$explvl)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35.00 45.00 60.00 60.91 75.00 173.00
g <- ggplot(data = mydf, aes(explvl, fill = explvl)) +
geom_histogram()
g
summary(mydf$inc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 93.0 130.0 148.6 180.0 959.0
g <- ggplot(data = mydf, aes(inc, fill = inc)) +
geom_histogram()
g
summary(mydf$assts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 3000 5467 6000 300000
g <- ggplot(data = mydf, aes(assts, fill = assts)) +
geom_histogram()
g
summary(mydf$debt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 353.8 0.0 30000.0
g <- ggplot(data = mydf, aes(debt, fill = debt)) +
geom_histogram()
g
summary(mydf$loanamount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100 700 1000 1028 1300 5000
g <- ggplot(data = mydf, aes(loanamount, fill = loanamount)) +
geom_histogram()
g
summary(mydf$purchprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 105 1111 1386 1440 1683 8800
g <- ggplot(data = mydf, aes(purchprice, fill = purchprice)) +
geom_histogram()
g
## records
## rating no_rec yes_rec
## 0 1462 133
## 1 434 194
## jtype
## rating fixed freelance others partime
## 0 1104 362 39 90
## 1 309 173 33 113
## homeown
## rating ignore other owner parents priv rent
## 0 5 74 876 254 69 317
## 1 4 86 196 108 35 199
library(dplyr)
str(mydf)
## 'data.frame': 2223 obs. of 14 variables:
## $ rating : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 2 ...
## $ experience: int 9 17 10 0 0 1 29 9 0 0 ...
## $ homeown : Factor w/ 6 levels "ignore","other",..: 6 6 3 6 6 3 3 4 3 4 ...
## $ loandurn : int 60 60 36 60 36 60 60 12 60 48 ...
## $ age : int 30 58 46 24 26 36 44 27 32 41 ...
## $ mstat : Factor w/ 5 levels "divorced","married",..: 2 5 2 4 4 2 2 4 2 2 ...
## $ rcds : Factor w/ 2 levels "no_rec","yes_rec": 1 1 2 1 1 1 1 1 1 1 ...
## $ jtype : Factor w/ 4 levels "fixed","freelance",..: 2 1 2 1 1 1 1 1 2 4 ...
## $ explvl : int 73 48 90 63 46 75 75 35 90 90 ...
## $ inc : int 129 131 200 182 107 214 125 80 107 80 ...
## $ assts : int 0 0 3000 2500 0 3500 10000 0 15000 0 ...
## $ debt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ loanamount: int 800 1000 2000 900 310 650 1600 200 1200 1200 ...
## $ purchprice: int 846 1658 2985 1325 910 1645 1800 1093 1957 1468 ...
# Logistic Model
model.logistic <- glm(formula = rating ~ ., family =binomial(link='logit'), data = mydf)
summary(model.logistic)
##
## Call:
## glm(formula = rating ~ ., family = binomial(link = "logit"),
## data = mydf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5219 -0.6973 -0.3956 0.6031 3.1915
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.738e-01 9.990e-01 0.274 0.784032
## experience -7.646e-02 1.048e-02 -7.293 3.03e-13 ***
## homeownother -3.322e-01 7.807e-01 -0.425 0.670501
## homeownowner -1.733e+00 7.607e-01 -2.278 0.022701 *
## homeownparents -1.404e+00 7.730e-01 -1.816 0.069310 .
## homeownpriv -1.019e+00 7.911e-01 -1.288 0.197857
## homeownrent -9.726e-01 7.646e-01 -1.272 0.203349
## loandurn 5.372e-03 4.882e-03 1.100 0.271222
## age 1.172e-02 6.999e-03 1.675 0.094032 .
## mstatmarried -1.474e+00 5.415e-01 -2.722 0.006481 **
## mstatseparated -1.728e-01 6.154e-01 -0.281 0.778852
## mstatsingle -9.441e-01 5.443e-01 -1.735 0.082814 .
## mstatwidow -1.976e+00 8.173e-01 -2.417 0.015639 *
## rcdsyes_rec 1.874e+00 1.539e-01 12.176 < 2e-16 ***
## jtypefreelance 8.816e-01 1.429e-01 6.168 6.93e-10 ***
## jtypeothers 1.113e+00 3.032e-01 3.671 0.000241 ***
## jtypepartime 1.275e+00 1.813e-01 7.034 2.00e-12 ***
## explvl 1.271e-02 3.813e-03 3.334 0.000855 ***
## inc -6.200e-03 8.841e-04 -7.012 2.34e-12 ***
## assts -2.638e-05 9.123e-06 -2.891 0.003834 **
## debt 1.920e-04 4.640e-05 4.138 3.50e-05 ***
## loanamount 1.975e-03 2.468e-04 8.002 1.23e-15 ***
## purchprice -1.022e-03 1.869e-04 -5.467 4.57e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2646.7 on 2222 degrees of freedom
## Residual deviance: 1954.8 on 2200 degrees of freedom
## AIC: 2000.8
##
## Number of Fisher Scoring iterations: 5
# Read test data
mydf.test <- test
mydf.test$rating <- ifelse(mydf.test$rating == "good",0,1)
model.logistic.fitted <- predict(model.logistic,mydf.test,type='response')
# How good our model is
model.logistic.fitted <- ifelse(model.logistic.fitted > 0.5,1,0)
misClassificationError <- mean(model.logistic.fitted != mydf.test$rating)
print(paste('Accuracy',round(1-misClassificationError,digits = 3)))
## [1] "Accuracy 0.792"
library(ROCR)
p <- predict(model.logistic, mydf.test, type="response")
pr <- prediction(p, mydf.test$rating)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
abline(a=0, b= 1)
auc1 <- performance(pr, measure = "auc")
auc1 <- auc1@y.values[[1]]
auc1
## [1] 0.8305952
We regard variable that its p-value < 0.05 as significant or important variable in the logistic regression.
Significant variables:
library(rpart)
library(rpart.plot)
model.tree <- rpart(rating ~ ., mydf, method = "class")
rpart.plot(model.tree, type=1, extra = 102)
probs <- predict(model.tree, mydf.test, type = "prob")[,2]
library(ROCR)
pred <- prediction(probs, mydf.test$rating)
perf <- performance(pred, "tpr" , "fpr")
plot(perf)
abline(a=0, b= 1)
auc2 <- performance(pred, measure = "auc")
auc2 <- auc2@y.values[[1]]
auc2
## [1] 0.7518058
Compare the ROC curves and auc values (0.83 > 0.75) below, logistic regression is better for predicting purpose than decision tree.
paste("Logistic:", round(auc1,digits = 3), sep = " ")
## [1] "Logistic: 0.831"
plot(prf)
abline(a=0, b= 1)
paste("Decision Tree:", round(auc2,digits = 3), sep = " ")
## [1] "Decision Tree: 0.752"
plot(perf)
abline(a=0, b= 1)