library(MASS)
library(pROC)

Reading in the cleaned-up training dataset:

# Load the cleaned training data (training_clean.csv) from GitHub into a data frame.
# NOTE(review): the model summaries further down list a predictor named `X`; that
# looks like a row-index column read.csv picked up from an unnamed first CSV column.
# Confirm, and if so drop it before fitting `target ~ .` so it is not used as a predictor.
crime_df <- read.csv("https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Classification%20Project/training_clean.csv")

Binary logistic regression models

Model 1. StepAIC

We will begin by creating a regression with all independent variables and use stepAIC to come up with the best model.

# Full model: binomial (logistic) GLM of `target` on every other column of crime_df
# (the `.` on the right-hand side expands to all remaining columns).
logit_1 <- glm(target~., family = binomial, data = crime_df)
# Print the coefficient table, deviances and AIC of the full model.
summary(logit_1)
## 
## Call:
## glm(formula = target ~ ., family = binomial, data = crime_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6102  -0.0515  -0.0001   0.0006   3.7286  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -43.347920   9.028483  -4.801 1.58e-06 ***
## X             0.001713   0.001798   0.953 0.340707    
## zn           -0.071313   0.044091  -1.617 0.105793    
## indus         0.181863   0.117304   1.550 0.121055    
## chas         -2.450089   2.555745  -0.959 0.337730    
## nox          52.885618  11.149131   4.743 2.10e-06 ***
## rm           -0.362011   0.930177  -0.389 0.697139    
## age           0.072527   0.020017   3.623 0.000291 ***
## dis           0.606912   0.305407   1.987 0.046898 *  
## rad           1.011640   0.250610   4.037 5.42e-05 ***
## tax          -0.025639   0.007880  -3.254 0.001140 ** 
## ptratio       0.408193   0.171870   2.375 0.017548 *  
## lstat        -0.001222   0.077155  -0.016 0.987361    
## medv          0.138610   0.086873   1.596 0.110592    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 605.53  on 436  degrees of freedom
## Residual deviance: 122.24  on 423  degrees of freedom
## AIC: 150.24
## 
## Number of Fisher Scoring iterations: 9
# Stepwise selection by AIC starting from the full model. No scope or direction is
# given, so the search only drops terms (every candidate in the trace is a removal).
# The returned object is the final selected glm fit.
logit_2 <- stepAIC(logit_1)
## Start:  AIC=150.24
## target ~ X + zn + indus + chas + nox + rm + age + dis + rad + 
##     tax + ptratio + lstat + medv
## 
##           Df Deviance    AIC
## - lstat    1   122.24 148.24
## - rm       1   122.39 148.39
## - X        1   123.16 149.16
## - chas     1   123.34 149.34
## <none>         122.24 150.24
## - indus    1   124.77 150.77
## - medv     1   124.96 150.96
## - zn       1   125.49 151.49
## - dis      1   126.54 152.54
## - ptratio  1   128.33 154.33
## - tax      1   134.47 160.47
## - age      1   138.73 164.73
## - rad      1   153.68 179.68
## - nox      1   171.82 197.82
## 
## Step:  AIC=148.24
## target ~ X + zn + indus + chas + nox + rm + age + dis + rad + 
##     tax + ptratio + medv
## 
##           Df Deviance    AIC
## - rm       1   122.42 146.42
## - X        1   123.17 147.17
## - chas     1   123.37 147.37
## <none>         122.24 148.24
## - indus    1   124.87 148.87
## - medv     1   124.97 148.97
## - zn       1   125.54 149.54
## - dis      1   126.78 150.78
## - ptratio  1   128.36 152.36
## - tax      1   135.00 159.00
## - age      1   143.43 167.43
## - rad      1   153.75 177.75
## - nox      1   172.70 196.70
## 
## Step:  AIC=146.42
## target ~ X + zn + indus + chas + nox + age + dis + rad + tax + 
##     ptratio + medv
## 
##           Df Deviance    AIC
## - X        1   123.37 145.37
## - chas     1   123.49 145.49
## <none>         122.42 146.42
## - indus    1   124.97 146.97
## - zn       1   126.10 148.10
## - dis      1   126.79 148.79
## - medv     1   127.59 149.59
## - ptratio  1   128.51 150.51
## - tax      1   135.10 157.10
## - age      1   146.86 168.86
## - rad      1   153.75 175.75
## - nox      1   173.19 195.19
## 
## Step:  AIC=145.37
## target ~ zn + indus + chas + nox + age + dis + rad + tax + ptratio + 
##     medv
## 
##           Df Deviance    AIC
## - chas     1   124.47 144.47
## <none>         123.37 145.37
## - indus    1   125.53 145.53
## - zn       1   126.42 146.42
## - dis      1   127.34 147.34
## - medv     1   128.01 148.01
## - ptratio  1   129.38 149.38
## - tax      1   135.55 155.55
## - age      1   148.89 168.89
## - rad      1   154.22 174.22
## - nox      1   173.22 193.22
## 
## Step:  AIC=144.47
## target ~ zn + indus + nox + age + dis + rad + tax + ptratio + 
##     medv
## 
##           Df Deviance    AIC
## - indus    1   125.93 143.93
## <none>         124.47 144.47
## - zn       1   127.35 145.35
## - dis      1   128.78 146.78
## - medv     1   128.78 146.78
## - ptratio  1   131.36 149.36
## - tax      1   135.84 153.84
## - age      1   150.99 168.99
## - rad      1   154.79 172.79
## - nox      1   174.63 192.63
## 
## Step:  AIC=143.93
## target ~ zn + nox + age + dis + rad + tax + ptratio + medv
## 
##           Df Deviance    AIC
## <none>         125.93 143.93
## - zn       1   129.11 145.11
## - medv     1   130.37 146.37
## - dis      1   131.31 147.31
## - ptratio  1   132.49 148.49
## - tax      1   136.00 152.00
## - age      1   153.21 169.21
## - rad      1   154.97 170.97
## - nox      1   200.04 216.04
# Coefficient table, deviances and AIC of the model chosen by stepAIC.
summary(logit_2)
## 
## Call:
## glm(formula = target ~ zn + nox + age + dis + rad + tax + ptratio + 
##     medv, family = binomial, data = crime_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2773  -0.0398  -0.0001   0.0007   3.9858  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -47.681276   8.458413  -5.637 1.73e-08 ***
## zn           -0.074786   0.048408  -1.545 0.122371    
## nox          60.487744  10.218536   5.919 3.23e-09 ***
## age           0.074793   0.016416   4.556 5.21e-06 ***
## dis           0.639652   0.290940   2.199 0.027909 *  
## rad           0.896830   0.240710   3.726 0.000195 ***
## tax          -0.019884   0.006812  -2.919 0.003513 ** 
## ptratio       0.370797   0.147048   2.522 0.011682 *  
## medv          0.096345   0.049702   1.938 0.052565 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 605.53  on 436  degrees of freedom
## Residual deviance: 125.93  on 428  degrees of freedom
## AIC: 143.93
## 
## Number of Fisher Scoring iterations: 9
# Five-number summary (plus mean) of the fitted probabilities on the training rows.
summary(logit_2$fitted.values)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000000 0.0004909 0.3774819 0.4874142 0.9999997 1.0000000

The resulting AIC is 143.93 - we will compare it to other models to see if this is the best result.

Another technique for evaluating model performance is the area under the ROC curve (AUC). The higher the area under the curve, the better the predictive power of the model. The AUC of a perfect predictive model equals 1.

# ROC curve of `target` against logit_2's fitted probabilities, drawn in blue.
# The fitted values come from the same crime_df rows the model was trained on,
# so this is an in-sample (training) ROC, not a held-out estimate.
roc(target~logit_2$fitted.values, data = crime_df, plot = TRUE, main = "ROC CURVE", col= "blue")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.formula(formula = target ~ logit_2$fitted.values, data = crime_df,     plot = TRUE, main = "ROC CURVE", col = "blue")
## 
## Data: logit_2$fitted.values in 224 controls (target 0) < 213 cases (target 1).
## Area under the curve: 0.988
# Report only the area under the in-sample ROC curve for logit_2.
auc(target~logit_2$fitted.values, data = crime_df)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.988

Our AUC is very close to 1, so we conclude that this model discriminates very well between the two classes on the training data.

Model 2. Forward Variable Selection approach.

# Intercept-only binomial GLM; this is the starting point for forward selection.
fwd_start <- glm(target~1, family = binomial, data = crime_df)
# Print the null model's summary (its residual deviance equals the null deviance).
summary(fwd_start)
## 
## Call:
## glm(formula = target ~ 1, family = binomial, data = crime_df)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.156  -1.156  -1.156   1.199   1.199  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.05035    0.09570  -0.526    0.599
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 605.53  on 436  degrees of freedom
## Residual deviance: 605.53  on 436  degrees of freedom
## AIC: 607.53
## 
## Number of Fisher Scoring iterations: 3
# Forward selection by AIC: start from the intercept-only model and add one term at a
# time, drawn from the full model's formula (used as the upper scope), until no
# single addition lowers the AIC.
fwd_final <- step(fwd_start, direction = "forward", scope = formula(logit_1))
## Start:  AIC=607.53
## target ~ 1
## 
##           Df Deviance    AIC
## + nox      1   199.36 203.36
## + indus    1   319.69 323.69
## + age      1   320.19 324.19
## + dis      1   323.49 327.49
## + tax      1   358.56 362.56
## + rad      1   368.04 372.04
## + lstat    1   457.77 461.77
## + zn       1   471.36 475.36
## + medv     1   558.14 562.14
## + ptratio  1   570.40 574.40
## + rm       1   590.93 594.93
## <none>         605.53 607.53
## + chas     1   604.94 608.94
## + X        1   605.06 609.06
## 
## Step:  AIC=203.36
## target ~ nox
## 
##           Df Deviance    AIC
## + rad      1   173.35 179.35
## + age      1   191.03 197.03
## + rm       1   193.05 199.05
## + tax      1   194.14 200.14
## + indus    1   195.21 201.21
## + ptratio  1   196.45 202.45
## + zn       1   196.63 202.63
## + medv     1   196.92 202.92
## <none>         199.36 203.36
## + chas     1   198.31 204.31
## + dis      1   198.96 204.96
## + lstat    1   199.34 205.34
## + X        1   199.34 205.34
## 
## Step:  AIC=179.35
## target ~ nox + rad
## 
##           Df Deviance    AIC
## + age      1   160.17 168.17
## + tax      1   162.09 170.09
## + rm       1   169.38 177.38
## + zn       1   170.82 178.82
## + ptratio  1   170.86 178.86
## <none>         173.35 179.35
## + dis      1   172.19 180.19
## + medv     1   172.39 180.39
## + chas     1   172.70 180.70
## + X        1   173.27 181.27
## + lstat    1   173.30 181.30
## + indus    1   173.33 181.33
## 
## Step:  AIC=168.17
## target ~ nox + rad + age
## 
##           Df Deviance    AIC
## + tax      1   140.24 150.24
## + dis      1   156.46 166.46
## + rm       1   157.10 167.10
## + medv     1   157.80 167.80
## + lstat    1   158.16 168.16
## <none>         160.17 168.17
## + ptratio  1   159.01 169.01
## + zn       1   159.28 169.28
## + indus    1   159.85 169.85
## + chas     1   159.91 169.91
## + X        1   160.16 170.16
## 
## Step:  AIC=150.24
## target ~ nox + rad + age + tax
## 
##           Df Deviance    AIC
## + ptratio  1   134.12 146.12
## + zn       1   136.73 148.73
## + indus    1   138.01 150.01
## <none>         140.24 150.24
## + dis      1   138.54 150.54
## + chas     1   139.59 151.59
## + medv     1   140.16 152.16
## + X        1   140.16 152.16
## + rm       1   140.20 152.20
## + lstat    1   140.23 152.23
## 
## Step:  AIC=146.12
## target ~ nox + rad + age + tax + ptratio
## 
##         Df Deviance    AIC
## + indus  1   131.67 145.67
## <none>       134.12 146.12
## + zn     1   132.39 146.39
## + dis    1   132.70 146.70
## + medv   1   133.06 147.06
## + rm     1   133.48 147.48
## + lstat  1   133.59 147.59
## + chas   1   134.03 148.03
## + X      1   134.11 148.11
## 
## Step:  AIC=145.67
## target ~ nox + rad + age + tax + ptratio + indus
## 
##         Df Deviance    AIC
## <none>       131.67 145.67
## + zn     1   130.07 146.07
## + medv   1   130.43 146.43
## + dis    1   130.83 146.83
## + chas   1   130.88 146.88
## + rm     1   131.02 147.02
## + lstat  1   131.39 147.39
## + X      1   131.56 147.56
# Coefficient table, deviances and AIC of the model chosen by forward selection.
summary(fwd_final)
## 
## Call:
## glm(formula = target ~ nox + rad + age + tax + ptratio + indus, 
##     family = binomial, data = crime_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7278  -0.0618  -0.0006   0.0006   3.2970  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -33.776657   6.463156  -5.226 1.73e-07 ***
## nox          48.433668  10.325771   4.691 2.72e-06 ***
## rad           1.047638   0.218877   4.786 1.70e-06 ***
## age           0.064191   0.015211   4.220 2.44e-05 ***
## tax          -0.029223   0.006472  -4.515 6.32e-06 ***
## ptratio       0.296409   0.121030   2.449   0.0143 *  
## indus         0.148125   0.094505   1.567   0.1170    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 605.53  on 436  degrees of freedom
## Residual deviance: 131.67  on 430  degrees of freedom
## AIC: 145.67
## 
## Number of Fisher Scoring iterations: 9

The AIC is 145.67, which is slightly higher (lower is better) than the 143.93 of our earlier stepAIC model, but the two models are still very close.

# ROC curve of `target` against fwd_final's fitted probabilities, drawn in blue.
# As the fitted values are for the training rows in crime_df, this is an in-sample ROC.
roc(target~fwd_final$fitted.values, data = crime_df, plot = TRUE, main = "ROC CURVE", col= "blue")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.formula(formula = target ~ fwd_final$fitted.values, data = crime_df,     plot = TRUE, main = "ROC CURVE", col = "blue")
## 
## Data: fwd_final$fitted.values in 224 controls (target 0) < 213 cases (target 1).
## Area under the curve: 0.9865
# Report only the area under the in-sample ROC curve for fwd_final.
auc(target~fwd_final$fitted.values, data = crime_df)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.9865

The AUC is slightly lower as well (0.9865 versus 0.988), but it is still very close to 1, which means that the model separates the two classes well on the training data.