Assignment004

This is a programming assignment.

Part A

DATA PREPARATION

Use the first half of this dataset to be the training set and the second half of this dataset to be the test set. Your task is to predict the credit rating of an individual.
Please ensure that the data are clean.

explore the data structure

##   rating experience homeown loandurn age   mstat    rcds     jtype explvl
## 1      0          9    rent       60  30 married  no_rec freelance     73
## 2      0         17    rent       60  58   widow  no_rec     fixed     48
## 3      1         10   owner       36  46 married yes_rec freelance     90
## 4      0          0    rent       60  24  single  no_rec     fixed     63
## 5      0          0    rent       36  26  single  no_rec     fixed     46
## 6      0          1   owner       60  36 married  no_rec     fixed     75
##   inc assts debt loanamount purchprice
## 1 129     0    0        800        846
## 2 131     0    0       1000       1658
## 3 200  3000    0       2000       2985
## 4 182  2500    0        900       1325
## 5 107     0    0        310        910
## 6 214  3500    0        650       1645

## 'data.frame':    2223 obs. of  14 variables:
##  $ rating    : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 2 ...
##  $ experience: int  9 17 10 0 0 1 29 9 0 0 ...
##  $ homeown   : Factor w/ 6 levels "ignore","other",..: 6 6 3 6 6 3 3 4 3 4 ...
##  $ loandurn  : int  60 60 36 60 36 60 60 12 60 48 ...
##  $ age       : int  30 58 46 24 26 36 44 27 32 41 ...
##  $ mstat     : Factor w/ 5 levels "divorced","married",..: 2 5 2 4 4 2 2 4 2 2 ...
##  $ rcds      : Factor w/ 2 levels "no_rec","yes_rec": 1 1 2 1 1 1 1 1 1 1 ...
##  $ jtype     : Factor w/ 4 levels "fixed","freelance",..: 2 1 2 1 1 1 1 1 2 4 ...
##  $ explvl    : int  73 48 90 63 46 75 75 35 90 90 ...
##  $ inc       : int  129 131 200 182 107 214 125 80 107 80 ...
##  $ assts     : int  0 0 3000 2500 0 3500 10000 0 15000 0 ...
##  $ debt      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ loanamount: int  800 1000 2000 900 310 650 1600 200 1200 1200 ...
##  $ purchprice: int  846 1658 2985 1325 910 1645 1800 1093 1957 1468 ...

Look for missing values

## [1] 0

##     rating experience    homeown   loandurn        age      mstat 
##          0          0          0          0          0          0 
##       rcds      jtype     explvl        inc      assts       debt 
##          0          0          0          0          0          0 
## loanamount purchprice 
##          0          0

Please provide the relevant exploratory and descriptive analysis

Descriptive stats

library(ggplot2)

table(mydf$rating)

## 
##    0    1 
## 1595  628

table(mydf$homeown)

## 
##  ignore   other   owner parents    priv    rent 
##       9     160    1072     362     104     516

g <- ggplot(data = mydf, aes(homeown, fill = homeown)) +
  geom_bar() +
  scale_fill_discrete(name="home ownership")
g

table(mydf$mstat)

## 
##  divorced   married separated    single     widow 
##        20      1647        59       463        34

g <- ggplot(data = mydf, aes(mstat, fill = mstat)) +
  geom_bar() +
  scale_fill_discrete(name="Marital status")
g

table(mydf$rcds)

## 
##  no_rec yes_rec 
##    1896     327

g <- ggplot(data = mydf, aes(rcds, fill = rcds)) +
  geom_bar() +
  scale_fill_discrete(name="Existence of records")
g

table(mydf$jtype)

## 
##     fixed freelance    others   partime 
##      1413       535        72       203

g <- ggplot(data = mydf, aes(jtype, fill = jtype)) +
  geom_bar() +
  scale_fill_discrete(name="Job type")
g

summary(mydf$experience)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   5.000   7.853  12.000  43.000

g <- ggplot(data = mydf, aes(experience, fill = experience)) +
  geom_histogram() 
g

summary(mydf$loandurn)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.00   36.00   48.00   46.36   60.00   60.00

g <- ggplot(data = mydf, aes(loandurn, fill = loandurn)) +
  geom_histogram(breaks = seq(0, 60, 5)) 
g

summary(mydf$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   28.00   35.00   37.07   45.00   68.00

g <- ggplot(data = mydf, aes(age, fill = age)) +
  geom_histogram(breaks = seq(15, 70 ,5))
g

summary(mydf$explvl)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35.00   45.00   60.00   60.91   75.00  173.00

g <- ggplot(data = mydf, aes(explvl, fill = explvl)) +
  geom_histogram() 
g

summary(mydf$inc)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    93.0   130.0   148.6   180.0   959.0

g <- ggplot(data = mydf, aes(inc, fill = inc)) +
  geom_histogram() 
g

summary(mydf$assts)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0    3000    5467    6000  300000

g <- ggplot(data = mydf, aes(assts, fill = assts)) +
  geom_histogram() 
g

summary(mydf$debt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.0   353.8     0.0 30000.0

g <- ggplot(data = mydf, aes(debt, fill = debt)) +
  geom_histogram() 
g

summary(mydf$loanamount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     100     700    1000    1028    1300    5000

g <- ggplot(data = mydf, aes(loanamount, fill = loanamount)) +
  geom_histogram() 
g

summary(mydf$purchprice)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     105    1111    1386    1440    1683    8800

g <- ggplot(data = mydf, aes(purchprice, fill = purchprice)) +
  geom_histogram() 
g

Two way analyses

##       records
## rating no_rec yes_rec
##      0   1462     133
##      1    434     194

##       jtype
## rating fixed freelance others partime
##      0  1104       362     39      90
##      1   309       173     33     113

##       homeown
## rating ignore other owner parents priv rent
##      0      5    74   876     254   69  317
##      1      4    86   196     108   35  199

Please explain how this descriptive analysis informed your next steps.

Part B

Please make a predictive model using logistic regression.

Create a correlation heat map

library(dplyr)

str(mydf)

## 'data.frame':    2223 obs. of  14 variables:
##  $ rating    : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 2 ...
##  $ experience: int  9 17 10 0 0 1 29 9 0 0 ...
##  $ homeown   : Factor w/ 6 levels "ignore","other",..: 6 6 3 6 6 3 3 4 3 4 ...
##  $ loandurn  : int  60 60 36 60 36 60 60 12 60 48 ...
##  $ age       : int  30 58 46 24 26 36 44 27 32 41 ...
##  $ mstat     : Factor w/ 5 levels "divorced","married",..: 2 5 2 4 4 2 2 4 2 2 ...
##  $ rcds      : Factor w/ 2 levels "no_rec","yes_rec": 1 1 2 1 1 1 1 1 1 1 ...
##  $ jtype     : Factor w/ 4 levels "fixed","freelance",..: 2 1 2 1 1 1 1 1 2 4 ...
##  $ explvl    : int  73 48 90 63 46 75 75 35 90 90 ...
##  $ inc       : int  129 131 200 182 107 214 125 80 107 80 ...
##  $ assts     : int  0 0 3000 2500 0 3500 10000 0 15000 0 ...
##  $ debt      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ loanamount: int  800 1000 2000 900 310 650 1600 200 1200 1200 ...
##  $ purchprice: int  846 1658 2985 1325 910 1645 1800 1093 1957 1468 ...

# Logistic Model
model.logistic <- glm(formula = rating ~ ., family =binomial(link='logit'), data = mydf)
summary(model.logistic)

## 
## Call:
## glm(formula = rating ~ ., family = binomial(link = "logit"), 
##     data = mydf)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5219  -0.6973  -0.3956   0.6031   3.1915  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     2.738e-01  9.990e-01   0.274 0.784032    
## experience     -7.646e-02  1.048e-02  -7.293 3.03e-13 ***
## homeownother   -3.322e-01  7.807e-01  -0.425 0.670501    
## homeownowner   -1.733e+00  7.607e-01  -2.278 0.022701 *  
## homeownparents -1.404e+00  7.730e-01  -1.816 0.069310 .  
## homeownpriv    -1.019e+00  7.911e-01  -1.288 0.197857    
## homeownrent    -9.726e-01  7.646e-01  -1.272 0.203349    
## loandurn        5.372e-03  4.882e-03   1.100 0.271222    
## age             1.172e-02  6.999e-03   1.675 0.094032 .  
## mstatmarried   -1.474e+00  5.415e-01  -2.722 0.006481 ** 
## mstatseparated -1.728e-01  6.154e-01  -0.281 0.778852    
## mstatsingle    -9.441e-01  5.443e-01  -1.735 0.082814 .  
## mstatwidow     -1.976e+00  8.173e-01  -2.417 0.015639 *  
## rcdsyes_rec     1.874e+00  1.539e-01  12.176  < 2e-16 ***
## jtypefreelance  8.816e-01  1.429e-01   6.168 6.93e-10 ***
## jtypeothers     1.113e+00  3.032e-01   3.671 0.000241 ***
## jtypepartime    1.275e+00  1.813e-01   7.034 2.00e-12 ***
## explvl          1.271e-02  3.813e-03   3.334 0.000855 ***
## inc            -6.200e-03  8.841e-04  -7.012 2.34e-12 ***
## assts          -2.638e-05  9.123e-06  -2.891 0.003834 ** 
## debt            1.920e-04  4.640e-05   4.138 3.50e-05 ***
## loanamount      1.975e-03  2.468e-04   8.002 1.23e-15 ***
## purchprice     -1.022e-03  1.869e-04  -5.467 4.57e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2646.7  on 2222  degrees of freedom
## Residual deviance: 1954.8  on 2200  degrees of freedom
## AIC: 2000.8
## 
## Number of Fisher Scoring iterations: 5

# Read test data
mydf.test <- test
mydf.test$rating <- ifelse(mydf.test$rating == "good",0,1)
model.logistic.fitted <- predict(model.logistic,mydf.test,type='response')


# How good our model is
model.logistic.fitted <- ifelse(model.logistic.fitted > 0.5,1,0)
misClassificationError <- mean(model.logistic.fitted != mydf.test$rating)
print(paste('Accuracy',round(1-misClassificationError,digits = 3)))

## [1] "Accuracy 0.792"

library(ROCR)
p <- predict(model.logistic, mydf.test, type="response")
pr <- prediction(p, mydf.test$rating)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
abline(a=0, b= 1)

auc1 <- performance(pr, measure = "auc")
auc1 <- auc1@y.values[[1]]
auc1

## [1] 0.8305952

Which variables in the logistic regression model are significant or important?
Please make a predictive model using a decision tree

library(rpart)
library(rpart.plot)

model.tree <- rpart(rating ~ ., mydf, method = "class")
rpart.plot(model.tree, type=1, extra = 102)

probs <- predict(model.tree, mydf.test, type = "prob")[,2]

library(ROCR)
pred <- prediction(probs, mydf.test$rating)
perf <- performance(pred, "tpr" , "fpr")
plot(perf)
abline(a=0, b= 1)

auc2 <- performance(pred, measure = "auc")
auc2 <- auc2@y.values[[1]]
auc2

## [1] 0.7518058

Describe and explain the tree. How will you explain the tree to your client (less than 250 words)?

Part C

Please help your client understand which model is better for predicting purposes.

paste("Logistic:", round(auc1,digits = 3), sep = " ")

## [1] "Logistic: 0.831"

plot(prf)
abline(a=0, b= 1)

paste("Decision Tree:", round(auc2,digits = 3), sep = " ")

## [1] "Decision Tree: 0.752"

plot(perf)
abline(a=0, b= 1)

Find a way to explain sensitivity, specificity and auc in the ROC to your client (in less than 250 words).
Provide a brief write-up of your findings. How will you present these findings to your clients in less than 250 words?

Assignment004

Shuwen Wang

Octobor 6, 2017

Part A

Part B

Part C