Q 10.1

Using the same crime data set uscrime.txt as in Questions 8.2 and 9.1, find the best model you can using (a) a regression tree model, and (b) a random forest model. In R, you can use the tree package or the rpart package, and the randomForest package. For each model, describe one or two qualitative takeaways you get from analyzing the results (i.e., don’t just stop when you have a good model, but interpret it too).

rm(list = ls())
set.seed(1)
uscrime <- read.table("uscrime.txt", header = TRUE)

# Load necessary packages
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(tree)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
# Build regression tree model using tree()
crimeTreeMod <- tree(Crime ~ ., data = uscrime)
summary(crimeTreeMod)
## 
## Regression tree:
## tree(formula = Crime ~ ., data = uscrime)
## Variables actually used in tree construction:
## [1] "Po1" "Pop" "LF"  "NW" 
## Number of terminal nodes:  7 
## Residual mean deviance:  47390 = 1896000 / 40 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -573.900  -98.300   -1.545    0.000  110.600  490.100
# see how tree was split
crimeTreeMod$frame
##       var  n        dev      yval splits.cutleft splits.cutright
## 1     Po1 47 6880927.66  905.0851          <7.65           >7.65
## 2     Pop 23  779243.48  669.6087          <22.5           >22.5
## 4      LF 12  243811.00  550.5000        <0.5675         >0.5675
## 8  <leaf>  7   48518.86  466.8571                               
## 9  <leaf>  5   77757.20  667.6000                               
## 5  <leaf> 11  179470.73  799.5455                               
## 3      NW 24 3604162.50 1130.7500          <7.65           >7.65
## 6     Pop 10  557574.90  886.9000          <21.5           >21.5
## 12 <leaf>  5  146390.80 1049.2000                               
## 13 <leaf>  5  147771.20  724.6000                               
## 7     Po1 14 2027224.93 1304.9286          <9.65           >9.65
## 14 <leaf>  6  170828.00 1041.0000                               
## 15 <leaf>  8 1124984.88 1502.8750
plot(crimeTreeMod)
text(crimeTreeMod)
title("USCRIME Training Set's Classification Tree")

# Prune the tree
termnodes <- 5
prune.crimeTreeMod <- prune.tree(crimeTreeMod, best = termnodes)
plot(prune.crimeTreeMod)
text(prune.crimeTreeMod)
title("Pruned Tree")

summary(prune.crimeTreeMod)
## 
## Regression tree:
## snip.tree(tree = crimeTreeMod, nodes = c(4L, 6L))
## Variables actually used in tree construction:
## [1] "Po1" "Pop" "NW" 
## Number of terminal nodes:  5 
## Residual mean deviance:  54210 = 2277000 / 42 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -573.9  -107.5    15.5     0.0   122.8   490.1
# Pruning increased the residual mean deviance (47,390 -> 54,210). On training data that's expected, since removing splits can only increase training error; the real question is whether the smaller tree generalizes better.
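# Sanity check: for a regression tree, deviance is just the RSS, so the
# "residual mean deviance" reported above is RSS / (n - #terminal nodes).
# This should reproduce the 54210 from summary():
sum((uscrime$Crime - predict(prune.crimeTreeMod))^2) / (nrow(uscrime) - 5)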
# Now look at the deviance at each tree size and cross-validate

cv.crime <- cv.tree(crimeTreeMod)
prune.tree(crimeTreeMod)$size
## [1] 7 6 5 4 3 2 1
prune.tree(crimeTreeMod)$dev
## [1] 1895722 2013257 2276670 2632631 3364043 4383406 6880928
cv.crime$dev
## [1] 8065974 8528989 8365725 8369740 8169538 8115979 8259569
# The cross-validated deviances are roughly 4x the training deviances, which suggests the tree overfits badly on only 47 observations
plot(cv.crime$size, cv.crime$dev, type = "b")

# Among the candidate sizes, the full tree with 7 terminal nodes has the lowest cross-validated deviance, so we'll keep all 7
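# Confirm programmatically rather than reading it off the plot (given the
# deviances printed above, this returns 7):
cv.crime$size[which.min(cv.crime$dev)]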

termnodes2 <- 7
prune.crimeTreeMod2 <- prune.tree(crimeTreeMod, best = termnodes2)
plot(prune.crimeTreeMod2)
text(prune.crimeTreeMod2)
title("Pruned Tree")

summary(prune.crimeTreeMod2)
## 
## Regression tree:
## tree(formula = Crime ~ ., data = uscrime)
## Variables actually used in tree construction:
## [1] "Po1" "Pop" "LF"  "NW" 
## Number of terminal nodes:  7 
## Residual mean deviance:  47390 = 1896000 / 40 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -573.900  -98.300   -1.545    0.000  110.600  490.100
# Calculate quality of fit for this model
# Note: predict.tree takes 'newdata', not 'data'; since we pass the training
# set itself, this R2 is an in-sample measure
crimeTreePredict <- predict(prune.crimeTreeMod2, newdata = uscrime)
RSS <- sum((crimeTreePredict - uscrime[,16])^2)
TSS <- sum((uscrime[,16] - mean(uscrime[,16]))^2)
R2 <- 1 - RSS/TSS
R2
## [1] 0.7244962
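# The 0.72 above is in-sample. With only 47 rows, a leave-one-out estimate is
# a more honest sketch of predictive R^2 (suppressWarnings because some folds
# may grow fewer than 7 leaves, in which case prune.tree returns the full tree):
looPreds <- sapply(1:nrow(uscrime), function(i) {
  fit <- tree(Crime ~ ., data = uscrime[-i, ])
  suppressWarnings(predict(prune.tree(fit, best = 7), newdata = uscrime[i, ]))
})
1 - sum((looPreds - uscrime$Crime)^2) / TSS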

Takeaways: The in-sample R2 was fairly high at 0.72, though the cross-validation results suggest overfitting is a real concern. 1. “Po1” is clearly the most important predictor: the very first split divides the 47 data points nearly in half on whether Po1 > 7.65. 2. LF looks like the weakest of the four predictors used, since pruning dropped LF but kept Pop and NW. It’s also interesting that the full tree used only 4 of the 15 predictors (and the pruned tree just 3).

# Create baseline randomForest Model
crime.rf <- randomForest(Crime ~ ., data=uscrime, importance = TRUE, nodesize = 5)
# Note: predict.randomForest takes 'newdata', not 'data'; with no newdata
# supplied, these are out-of-bag (OOB) predictions, not in-sample fits
crime.rf.predict <- predict(crime.rf)
RSS <- sum((crime.rf.predict - uscrime[,16])^2)
R2 <- 1 - RSS/TSS
R2
## [1] 0.4035882
# An R2 of 0.40 looks much worse than the tree's 0.72, but it isn't an
# apples-to-apples comparison: this one is based on OOB predictions (roughly
# out-of-sample), while the tree's R2 was in-sample.
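# To see the gap, a quick sketch comparing with in-sample predictions
# (passing newdata explicitly); this R2 should come out much higher:
crime.rf.insample <- predict(crime.rf, newdata = uscrime)
1 - sum((crime.rf.insample - uscrime[,16])^2) / TSS
# Now let's try a different value of mtry...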
crime.rf2 <- randomForest(Crime ~ ., data=uscrime, importance = TRUE, nodesize = 5, mtry = 10)
crime.rf.predict2 <- predict(crime.rf2)  # OOB predictions again
RSS <- sum((crime.rf.predict2 - uscrime[,16])^2)
R2 <- 1 - RSS/TSS
R2
## [1] 0.3728571
# Let's make a loop to plug in different values of mtry and nodesize to try and find the model with the best R2
result.rf <- data.frame(matrix(nrow = 14 * 20, ncol = 3))  # 14 nodesizes x 20 mtry values
colnames(result.rf) <- c("NodeSize", "mtry", "R2")
i = 1
# mtry can't exceed the number of predictors (15); larger values trigger
# "invalid mtry" warnings as randomForest caps them, hence suppressWarnings
suppressWarnings(for (nodesize in 2:15) {
  for (m in 1:20) {
    model <- randomForest(Crime ~ ., data=uscrime, importance = TRUE, nodesize = nodesize, mtry = m)
    preds <- predict(model)  # again OOB predictions, since no newdata is given
    RSS <- sum((preds - uscrime[,16])^2)
    TSS <- sum((uscrime[,16] - mean(uscrime[,16]))^2)
    R2 <- 1 - RSS/TSS
    result.rf[i,1] <- nodesize
    result.rf[i,2] <- m
    result.rf[i,3] <- R2
    i = i + 1
  }
})
head(result.rf)
##   NodeSize mtry        R2
## 1        2    1 0.3905234
## 2        2    2 0.4238620
## 3        2    3 0.4402864
## 4        2    4 0.4275682
## 5        2    5 0.4249330
## 6        2    6 0.3990316
result.rf[which.max(result.rf[,3]),]
##    NodeSize mtry        R2
## 43        4    3 0.4520696
# So a nodesize of 4 and an mtry value of 3 give the best OOB R2. (Each fit is stochastic, so these grid results are somewhat noisy on 47 rows.)
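# As a cross-check, randomForest's built-in tuneRF helper searches mtry by OOB
# error (the stepFactor/improve values here are arbitrary choices, not tuned):
tuneRF(uscrime[,-16], uscrime[,16], ntreeTry = 500, stepFactor = 1.5,
       improve = 0.01, trace = FALSE, plot = FALSE)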
crime.rf.final <- randomForest(Crime ~ ., data=uscrime, importance = TRUE, nodesize = 4, mtry = 3)
# Let's look at the importance of the variables for our final model...we can use the importance function
importance(crime.rf.final)
##           %IncMSE IncNodePurity
## M       1.9928339     200546.17
## So      2.0769532      31708.28
## Ed      4.6576592     291834.15
## Po1    10.5638376     919837.78
## Po2     9.0046553     959202.18
## LF      3.1941212     339000.28
## M.F     0.2105583     326088.42
## Pop     0.9004247     388810.29
## NW      9.5705949     572850.07
## U1     -0.2543793     178311.83
## U2      2.2667735     201369.99
## Wealth  5.5759825     722972.45
## Ineq    3.0309594     248048.13
## Prob    7.2194210     755163.72
## Time    1.3109350     250748.39
varImpPlot(crime.rf.final)
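# Rank predictors directly by %IncMSE instead of eyeballing the plot
imp <- importance(crime.rf.final)
imp[order(-imp[, "%IncMSE"]), ]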

So the best R2 over the grid is about 0.45, at a nodesize of 4 and an mtry value (number of variables sampled at each split) of 3. The importance measures for the final model show Po1, NW, and Po2 as the most important predictors by “%IncMSE”, echoing the single tree’s heavy reliance on Po1. My takeaways for randomForest are: 1. it looks less accurate than the single tree on this data, though the comparison flatters the tree, since 0.45 is an out-of-bag (roughly out-of-sample) R2 while the tree’s 0.72 was in-sample; 2. increasing the number of variables sampled at each split beyond a small value decreases OOB R2 here, which likely ties back to overfitting: with only 47 rows, letting every split see most of the predictors makes the trees more similar to one another and the ensemble less robust.

Q 10.2

Describe a situation or problem from your job, everyday life, current events, etc., for which a logistic regression model would be appropriate. List some (up to 5) predictors that you might use.

My firm relies on recurring membership revenue from clients. A client who drops membership is a big hit to our revenue, so retaining members is critical. I could use a logistic regression model to estimate the likelihood that a client will cancel their membership, which would help our relationship managers focus on the high-risk clients. As predictors I’d use the number of contacts with the client, attendance at our webinars/meetings, client margin last quarter, and the ratio of active users on our website to total employees.
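A minimal sketch of what that model call might look like in R (the data frame and column names here are hypothetical):

churnModel <- glm(cancelled ~ contacts + webinar_attendance + margin_last_q + active_user_ratio,
                  data = clients, family = binomial(link = "logit"))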

Q 10.3

1. Using the GermanCredit data set germancredit.txt from http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/ (description at http://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29 ), use logistic regression to find a good predictive model for whether credit applicants are good credit risks or not. Show your model (factors used and their coefficients), the software output, and the quality of fit. You can use the glm function in R. To get a logistic regression (logit) model on data where the response is either zero or one, use family=binomial(link="logit") in your glm function call. 2. Because the model gives a result between 0 and 1, it requires setting a threshold probability to separate between “good” and “bad” answers. In this data set, they estimate that incorrectly identifying a bad customer as good is 5 times worse than incorrectly classifying a good customer as bad. Determine a good threshold probability based on your model.

set.seed(1)
credit <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data", header = FALSE)
str(credit)
## 'data.frame':    1000 obs. of  21 variables:
##  $ V1 : Factor w/ 4 levels "A11","A12","A13",..: 1 2 4 1 1 4 4 2 4 2 ...
##  $ V2 : int  6 48 12 42 24 36 24 36 12 30 ...
##  $ V3 : Factor w/ 5 levels "A30","A31","A32",..: 5 3 5 3 4 3 3 3 3 5 ...
##  $ V4 : Factor w/ 10 levels "A40","A41","A410",..: 5 5 8 4 1 8 4 2 5 1 ...
##  $ V5 : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ V6 : Factor w/ 5 levels "A61","A62","A63",..: 5 1 1 1 1 5 3 1 4 1 ...
##  $ V7 : Factor w/ 5 levels "A71","A72","A73",..: 5 3 4 4 3 3 5 3 4 1 ...
##  $ V8 : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ V9 : Factor w/ 4 levels "A91","A92","A93",..: 3 2 3 3 3 3 3 3 1 4 ...
##  $ V10: Factor w/ 3 levels "A101","A102",..: 1 1 1 3 1 1 1 1 1 1 ...
##  $ V11: int  4 2 3 4 4 4 4 2 4 2 ...
##  $ V12: Factor w/ 4 levels "A121","A122",..: 1 1 1 2 4 4 2 3 1 3 ...
##  $ V13: int  67 22 49 45 53 35 53 35 61 28 ...
##  $ V14: Factor w/ 3 levels "A141","A142",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ V15: Factor w/ 3 levels "A151","A152",..: 2 2 2 3 3 3 2 1 2 2 ...
##  $ V16: int  2 1 1 1 2 1 1 1 1 2 ...
##  $ V17: Factor w/ 4 levels "A171","A172",..: 3 3 2 3 3 2 3 4 2 4 ...
##  $ V18: int  1 1 2 2 2 2 1 1 1 1 ...
##  $ V19: Factor w/ 2 levels "A191","A192": 2 1 1 1 1 2 1 2 1 1 ...
##  $ V20: Factor w/ 2 levels "A201","A202": 1 1 1 1 1 1 1 1 1 1 ...
##  $ V21: int  1 2 1 1 2 1 1 1 1 2 ...
# See what the responses look like
table(credit$V21)
## 
##   1   2 
## 700 300
# Recode the response so the model predicts P(bad): 1 (good) -> 0, 2 (bad) -> 1
credit$V21[credit$V21==1] <- 0
credit$V21[credit$V21==2] <- 1

# We should measure model quality in terms of accuracy: (truePositives + trueNegatives) / total predictions

# Split data into train and validation set
creditPart <- createDataPartition(credit$V21, times = 1, p = 0.7, list=FALSE)
head(creditPart)
##      Resample1
## [1,]         1
## [2,]         2
## [3,]         4
## [4,]         5
## [5,]         7
## [6,]        11
creditTrain <- credit[creditPart,] 
creditValid <- credit[-creditPart,]

table(creditTrain$V21)
## 
##   0   1 
## 495 205
table(creditValid$V21)
## 
##   0   1 
## 205  95
creditLogModel <- glm(V21 ~ ., data = creditTrain, family=binomial(link="logit"))
# Look at importance of predictors
summary(creditLogModel)
## 
## Call:
## glm(formula = V21 ~ ., family = binomial(link = "logit"), data = creditTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3342  -0.6532  -0.3342   0.5314   2.5487  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.719e+00  1.365e+00   1.259 0.207907    
## V1A12       -1.509e-01  2.780e-01  -0.543 0.587178    
## V1A13       -1.729e+00  5.490e-01  -3.149 0.001637 ** 
## V1A14       -1.423e+00  2.886e-01  -4.932 8.16e-07 ***
## V2           3.397e-02  1.145e-02   2.966 0.003012 ** 
## V3A31       -5.534e-01  7.178e-01  -0.771 0.440774    
## V3A32       -9.670e-01  5.689e-01  -1.700 0.089158 .  
## V3A33       -1.300e+00  5.929e-01  -2.192 0.028351 *  
## V3A34       -2.078e+00  5.780e-01  -3.596 0.000323 ***
## V4A41       -2.239e+00  5.121e-01  -4.373 1.23e-05 ***
## V4A410      -3.146e+00  1.094e+00  -2.877 0.004015 ** 
## V4A42       -1.124e+00  3.287e-01  -3.420 0.000627 ***
## V4A43       -9.062e-01  3.064e-01  -2.958 0.003100 ** 
## V4A44       -2.663e-01  8.437e-01  -0.316 0.752277    
## V4A45       -5.171e-01  8.130e-01  -0.636 0.524778    
## V4A46        9.718e-03  4.976e-01   0.020 0.984418    
## V4A48       -2.013e+00  1.220e+00  -1.650 0.099011 .  
## V4A49       -1.007e+00  4.283e-01  -2.352 0.018692 *  
## V5           1.229e-04  5.744e-05   2.140 0.032360 *  
## V6A62       -6.178e-01  3.644e-01  -1.695 0.089999 .  
## V6A63       -8.173e-01  5.472e-01  -1.494 0.135252    
## V6A64       -6.326e-01  5.896e-01  -1.073 0.283282    
## V6A65       -1.284e+00  3.474e-01  -3.696 0.000219 ***
## V7A72        5.984e-02  5.171e-01   0.116 0.907865    
## V7A73       -3.262e-01  4.931e-01  -0.662 0.508180    
## V7A74       -9.421e-01  5.448e-01  -1.729 0.083751 .  
## V7A75       -5.208e-01  5.103e-01  -1.020 0.307522    
## V8           3.228e-01  1.150e-01   2.808 0.004993 ** 
## V9A92       -5.424e-01  4.800e-01  -1.130 0.258532    
## V9A93       -9.459e-01  4.679e-01  -2.022 0.043204 *  
## V9A94       -7.790e-01  5.753e-01  -1.354 0.175709    
## V10A102      4.178e-01  4.918e-01   0.850 0.395583    
## V10A103     -1.079e+00  5.364e-01  -2.012 0.044220 *  
## V11          1.581e-02  1.098e-01   0.144 0.885556    
## V12A122      6.148e-01  3.220e-01   1.909 0.056242 .  
## V12A123      4.082e-01  2.999e-01   1.361 0.173459    
## V12A124      1.228e+00  5.467e-01   2.247 0.024638 *  
## V13         -1.678e-02  1.180e-02  -1.422 0.155043    
## V14A142     -5.909e-01  5.155e-01  -1.146 0.251663    
## V14A143     -1.086e+00  2.972e-01  -3.654 0.000258 ***
## V15A152     -7.838e-01  3.021e-01  -2.595 0.009467 ** 
## V15A153     -1.350e+00  6.194e-01  -2.180 0.029239 *  
## V16          4.381e-01  2.499e-01   1.753 0.079537 .  
## V17A172      4.812e-01  8.505e-01   0.566 0.571528    
## V17A173      4.486e-01  8.190e-01   0.548 0.583825    
## V17A174      2.924e-01  8.287e-01   0.353 0.724258    
## V18          3.206e-02  3.177e-01   0.101 0.919610    
## V19A192     -2.567e-01  2.560e-01  -1.003 0.315973    
## V20A202     -1.678e+00  8.814e-01  -1.904 0.056863 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 846.57  on 699  degrees of freedom
## Residual deviance: 582.88  on 651  degrees of freedom
## AIC: 680.88
## 
## Number of Fisher Scoring iterations: 5
# Baseline prediction on the validation set; in the table below, rows are the
# observed classes and columns are the predictions at a 0.5 threshold (rounding)
creditPredict <- predict(creditLogModel, newdata=creditValid[,-21], type="response")
table(creditValid$V21, round(creditPredict))
##    
##       0   1
##   0 179  26
##   1  53  42
# Our initial baseline model, using all predictors and a simple 0.5 threshold, correctly classifies most of the good borrowers (0), but lets 53 of the 95 bad borrowers (1) through as good. Since approving a bad borrower costs 5x as much as rejecting a good one, the number of bad borrowers that slip through matters most. Let's first simplify the model, keeping only a handful of the most significant coefficients.
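# An alternative to hand-picking levels: stepwise AIC selection over the full
# model (a sketch; it can be slow, and AIC won't necessarily agree with a
# manual p-value cutoff):
creditLogModelStep <- step(creditLogModel, trace = 0)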

# Since significance applies to individual levels of a categorical column, we manually create 0/1 indicators for the significant levels.
creditTrain$V1A14[creditTrain$V1 == "A14"] <- 1
creditTrain$V1A14[creditTrain$V1 != "A14"] <- 0

creditTrain$V3A34[creditTrain$V3 == "A34"] <- 1
creditTrain$V3A34[creditTrain$V3 != "A34"] <- 0

creditTrain$V4A41[creditTrain$V4 == "A41"] <- 1
creditTrain$V4A41[creditTrain$V4 != "A41"] <- 0

creditTrain$V4A43[creditTrain$V4 == "A43"] <- 1
creditTrain$V4A43[creditTrain$V4 != "A43"] <- 0

creditLogModel2 <- glm(V21 ~ V1A14+V2+V3A34+V4A41+V4A43, data = creditTrain, family=binomial(link="logit"))
summary(creditLogModel2)
## 
## Call:
## glm(formula = V21 ~ V1A14 + V2 + V3A34 + V4A41 + V4A43, family = binomial(link = "logit"), 
##     data = creditTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6522  -0.8199  -0.4648   0.9390   2.3414  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.848240   0.202446  -4.190 2.79e-05 ***
## V1A14       -1.458644   0.220552  -6.614 3.75e-11 ***
## V2           0.039963   0.007447   5.367 8.02e-08 ***
## V3A34       -0.847121   0.234069  -3.619 0.000296 ***
## V4A41       -1.357456   0.395340  -3.434 0.000596 ***
## V4A43       -0.548921   0.220345  -2.491 0.012731 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 846.57  on 699  degrees of freedom
## Residual deviance: 709.32  on 694  degrees of freedom
## AIC: 721.32
## 
## Number of Fisher Scoring iterations: 5
# Now let's use the validation data and process it in the same way
creditValid$V1A14[creditValid$V1 == "A14"] <- 1
creditValid$V1A14[creditValid$V1 != "A14"] <- 0

creditValid$V3A34[creditValid$V3 == "A34"] <- 1
creditValid$V3A34[creditValid$V3 != "A34"] <- 0

creditValid$V4A41[creditValid$V4 == "A41"] <- 1
creditValid$V4A41[creditValid$V4 != "A41"] <- 0

creditValid$V4A43[creditValid$V4 == "A43"] <- 1
creditValid$V4A43[creditValid$V4 != "A43"] <- 0

# create confusion matrix of predicted vs. observed values on validation set
creditPredict2 <- predict(creditLogModel2, newdata=creditValid[,-21], type="response")
t <- as.matrix(table(round(creditPredict2), creditValid$V21))
names(dimnames(t)) <- c("Predicted", "Observed")
t
##          Observed
## Predicted   0   1
##         0 187  64
##         1  18  31
# Calculate accuracy and specificity (defined here as the fraction of good borrowers correctly classified). Raising the threshold raises this specificity, but it also approves more bad borrowers, so it has to be weighed against the 5:1 cost ratio.
threshold <- 0.7
t2 <- as.matrix(table(as.integer(creditPredict2 > threshold), creditValid$V21))
names(dimnames(t2)) <- c("Predicted", "Observed")
t2
##          Observed
## Predicted   0   1
##         0 202  91
##         1   3   4
accuracy <- (t2[1,1]+t2[2,2])/(t2[1,1]+t2[1,2]+t2[2,1]+t2[2,2])
accuracy
## [1] 0.6866667
specificity <- (t2[1,1])/(t2[1,1]+t2[2,1])
specificity
## [1] 0.9853659
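# The 5:1 cost ratio itself suggests a threshold: classify as bad whenever
# 5*p > 1*(1-p), i.e. p > 1/6. A sketch that scans thresholds by total
# validation-set cost (5 per bad borrower approved, 1 per good borrower rejected):
ths <- seq(0.05, 0.95, by = 0.05)
costs <- sapply(ths, function(th) {
  pred <- as.integer(creditPredict2 > th)
  5 * sum(pred == 0 & creditValid$V21 == 1) + sum(pred == 1 & creditValid$V21 == 0)
})
ths[which.min(costs)]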

So, with a threshold of 0.7, we arrive at a specificity of 98.5% and an overall accuracy of 68%. One caveat worth flagging: at this threshold only 4 of the 95 bad borrowers in the validation set get flagged, and approving a bad borrower is the 5x-costlier mistake. The expected-cost argument (classify as bad when p > 1/6) points toward also trying thresholds below 0.5, so the cost scan above is worth running to confirm which threshold actually minimizes total cost for this application.