library(MASS)
library(pROC)
Read in the cleaned-up training dataset:
# Load the cleaned training data from the project's GitHub repository.
# NOTE(review): the full-model summary lists a predictor named `X`, which
# looks like a row-index column created when the CSV was written -- confirm,
# and consider dropping it before fitting `target ~ .`.
crime_df <- read.csv(
  file = "https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Classification%20Project/training_clean.csv"
)
We will begin by fitting a logistic regression on all independent variables and then use `stepAIC` to search for the model with the lowest AIC.
# Baseline logistic regression: `target` modelled on every other column of
# `crime_df`. This full model is the starting point for stepwise selection.
logit_1 <- glm(
  formula = target ~ .,
  family = binomial,
  data = crime_df
)
# Print coefficient estimates, deviances and AIC for the full model.
summary(logit_1)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = crime_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6102 -0.0515 -0.0001 0.0006 3.7286
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -43.347920 9.028483 -4.801 1.58e-06 ***
## X 0.001713 0.001798 0.953 0.340707
## zn -0.071313 0.044091 -1.617 0.105793
## indus 0.181863 0.117304 1.550 0.121055
## chas -2.450089 2.555745 -0.959 0.337730
## nox 52.885618 11.149131 4.743 2.10e-06 ***
## rm -0.362011 0.930177 -0.389 0.697139
## age 0.072527 0.020017 3.623 0.000291 ***
## dis 0.606912 0.305407 1.987 0.046898 *
## rad 1.011640 0.250610 4.037 5.42e-05 ***
## tax -0.025639 0.007880 -3.254 0.001140 **
## ptratio 0.408193 0.171870 2.375 0.017548 *
## lstat -0.001222 0.077155 -0.016 0.987361
## medv 0.138610 0.086873 1.596 0.110592
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 605.53 on 436 degrees of freedom
## Residual deviance: 122.24 on 423 degrees of freedom
## AIC: 150.24
##
## Number of Fisher Scoring iterations: 9
# Backward elimination by AIC, starting from the full model: at each step the
# term whose removal lowers AIC the most is dropped, and the search stops when
# no single removal improves AIC. The selected model is stored in `logit_2`.
logit_2 <- stepAIC(object = logit_1, direction = "backward")
## Start: AIC=150.24
## target ~ X + zn + indus + chas + nox + rm + age + dis + rad +
## tax + ptratio + lstat + medv
##
## Df Deviance AIC
## - lstat 1 122.24 148.24
## - rm 1 122.39 148.39
## - X 1 123.16 149.16
## - chas 1 123.34 149.34
## <none> 122.24 150.24
## - indus 1 124.77 150.77
## - medv 1 124.96 150.96
## - zn 1 125.49 151.49
## - dis 1 126.54 152.54
## - ptratio 1 128.33 154.33
## - tax 1 134.47 160.47
## - age 1 138.73 164.73
## - rad 1 153.68 179.68
## - nox 1 171.82 197.82
##
## Step: AIC=148.24
## target ~ X + zn + indus + chas + nox + rm + age + dis + rad +
## tax + ptratio + medv
##
## Df Deviance AIC
## - rm 1 122.42 146.42
## - X 1 123.17 147.17
## - chas 1 123.37 147.37
## <none> 122.24 148.24
## - indus 1 124.87 148.87
## - medv 1 124.97 148.97
## - zn 1 125.54 149.54
## - dis 1 126.78 150.78
## - ptratio 1 128.36 152.36
## - tax 1 135.00 159.00
## - age 1 143.43 167.43
## - rad 1 153.75 177.75
## - nox 1 172.70 196.70
##
## Step: AIC=146.42
## target ~ X + zn + indus + chas + nox + age + dis + rad + tax +
## ptratio + medv
##
## Df Deviance AIC
## - X 1 123.37 145.37
## - chas 1 123.49 145.49
## <none> 122.42 146.42
## - indus 1 124.97 146.97
## - zn 1 126.10 148.10
## - dis 1 126.79 148.79
## - medv 1 127.59 149.59
## - ptratio 1 128.51 150.51
## - tax 1 135.10 157.10
## - age 1 146.86 168.86
## - rad 1 153.75 175.75
## - nox 1 173.19 195.19
##
## Step: AIC=145.37
## target ~ zn + indus + chas + nox + age + dis + rad + tax + ptratio +
## medv
##
## Df Deviance AIC
## - chas 1 124.47 144.47
## <none> 123.37 145.37
## - indus 1 125.53 145.53
## - zn 1 126.42 146.42
## - dis 1 127.34 147.34
## - medv 1 128.01 148.01
## - ptratio 1 129.38 149.38
## - tax 1 135.55 155.55
## - age 1 148.89 168.89
## - rad 1 154.22 174.22
## - nox 1 173.22 193.22
##
## Step: AIC=144.47
## target ~ zn + indus + nox + age + dis + rad + tax + ptratio +
## medv
##
## Df Deviance AIC
## - indus 1 125.93 143.93
## <none> 124.47 144.47
## - zn 1 127.35 145.35
## - dis 1 128.78 146.78
## - medv 1 128.78 146.78
## - ptratio 1 131.36 149.36
## - tax 1 135.84 153.84
## - age 1 150.99 168.99
## - rad 1 154.79 172.79
## - nox 1 174.63 192.63
##
## Step: AIC=143.93
## target ~ zn + nox + age + dis + rad + tax + ptratio + medv
##
## Df Deviance AIC
## <none> 125.93 143.93
## - zn 1 129.11 145.11
## - medv 1 130.37 146.37
## - dis 1 131.31 147.31
## - ptratio 1 132.49 148.49
## - tax 1 136.00 152.00
## - age 1 153.21 169.21
## - rad 1 154.97 170.97
## - nox 1 200.04 216.04
# Inspect the coefficients, deviances and AIC of the model chosen by stepAIC.
summary(logit_2)
##
## Call:
## glm(formula = target ~ zn + nox + age + dis + rad + tax + ptratio +
## medv, family = binomial, data = crime_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2773 -0.0398 -0.0001 0.0007 3.9858
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -47.681276 8.458413 -5.637 1.73e-08 ***
## zn -0.074786 0.048408 -1.545 0.122371
## nox 60.487744 10.218536 5.919 3.23e-09 ***
## age 0.074793 0.016416 4.556 5.21e-06 ***
## dis 0.639652 0.290940 2.199 0.027909 *
## rad 0.896830 0.240710 3.726 0.000195 ***
## tax -0.019884 0.006812 -2.919 0.003513 **
## ptratio 0.370797 0.147048 2.522 0.011682 *
## medv 0.096345 0.049702 1.938 0.052565 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 605.53 on 436 degrees of freedom
## Residual deviance: 125.93 on 428 degrees of freedom
## AIC: 143.93
##
## Number of Fisher Scoring iterations: 9
# Distribution (min, quartiles, mean, max) of the in-sample fitted
# probabilities produced by the stepAIC model.
summary(fitted(logit_2))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000000 0.0004909 0.3774819 0.4874142 0.9999997 1.0000000
The resulting AIC is 143.93; we will compare it with other models to see whether this is the best result.
Another technique for evaluating model performance is the area under the ROC curve (AUC). The higher the area under the curve, the better the predictive power of the model. The AUC of a perfect predictive model equals 1.
# Draw the in-sample ROC curve for the stepAIC model: the observed `target`
# against the model's fitted probabilities. The printed result also reports
# the area under the curve.
roc(
  target ~ logit_2$fitted.values,
  data = crime_df,
  plot = TRUE,
  main = "ROC CURVE",
  col = "blue"
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.formula(formula = target ~ logit_2$fitted.values, data = crime_df, plot = TRUE, main = "ROC CURVE", col = "blue")
##
## Data: logit_2$fitted.values in 224 controls (target 0) < 213 cases (target 1).
## Area under the curve: 0.988
# Report only the area under the ROC curve for the stepAIC model.
auc(
  target ~ logit_2$fitted.values,
  data = crime_df
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.988
Our AUC is very close to 1, so we conclude that this is a very good model in terms of prediction accuracy on the training data.
# Intercept-only logistic model; it serves as the starting point for the
# forward selection performed next.
fwd_start <- glm(
  formula = target ~ 1,
  family = binomial,
  data = crime_df
)
# Show the null model's intercept, deviance and AIC.
summary(fwd_start)
##
## Call:
## glm(formula = target ~ 1, family = binomial, data = crime_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.156 -1.156 -1.156 1.199 1.199
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.05035 0.09570 -0.526 0.599
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 605.53 on 436 degrees of freedom
## Residual deviance: 605.53 on 436 degrees of freedom
## AIC: 607.53
##
## Number of Fisher Scoring iterations: 3
# Forward selection by AIC: begin with the intercept-only model and add one
# term at a time, considering only the predictors in the full model's formula,
# until no single addition lowers AIC.
fwd_final <- step(
  object = fwd_start,
  scope = formula(logit_1),
  direction = "forward"
)
## Start: AIC=607.53
## target ~ 1
##
## Df Deviance AIC
## + nox 1 199.36 203.36
## + indus 1 319.69 323.69
## + age 1 320.19 324.19
## + dis 1 323.49 327.49
## + tax 1 358.56 362.56
## + rad 1 368.04 372.04
## + lstat 1 457.77 461.77
## + zn 1 471.36 475.36
## + medv 1 558.14 562.14
## + ptratio 1 570.40 574.40
## + rm 1 590.93 594.93
## <none> 605.53 607.53
## + chas 1 604.94 608.94
## + X 1 605.06 609.06
##
## Step: AIC=203.36
## target ~ nox
##
## Df Deviance AIC
## + rad 1 173.35 179.35
## + age 1 191.03 197.03
## + rm 1 193.05 199.05
## + tax 1 194.14 200.14
## + indus 1 195.21 201.21
## + ptratio 1 196.45 202.45
## + zn 1 196.63 202.63
## + medv 1 196.92 202.92
## <none> 199.36 203.36
## + chas 1 198.31 204.31
## + dis 1 198.96 204.96
## + lstat 1 199.34 205.34
## + X 1 199.34 205.34
##
## Step: AIC=179.35
## target ~ nox + rad
##
## Df Deviance AIC
## + age 1 160.17 168.17
## + tax 1 162.09 170.09
## + rm 1 169.38 177.38
## + zn 1 170.82 178.82
## + ptratio 1 170.86 178.86
## <none> 173.35 179.35
## + dis 1 172.19 180.19
## + medv 1 172.39 180.39
## + chas 1 172.70 180.70
## + X 1 173.27 181.27
## + lstat 1 173.30 181.30
## + indus 1 173.33 181.33
##
## Step: AIC=168.17
## target ~ nox + rad + age
##
## Df Deviance AIC
## + tax 1 140.24 150.24
## + dis 1 156.46 166.46
## + rm 1 157.10 167.10
## + medv 1 157.80 167.80
## + lstat 1 158.16 168.16
## <none> 160.17 168.17
## + ptratio 1 159.01 169.01
## + zn 1 159.28 169.28
## + indus 1 159.85 169.85
## + chas 1 159.91 169.91
## + X 1 160.16 170.16
##
## Step: AIC=150.24
## target ~ nox + rad + age + tax
##
## Df Deviance AIC
## + ptratio 1 134.12 146.12
## + zn 1 136.73 148.73
## + indus 1 138.01 150.01
## <none> 140.24 150.24
## + dis 1 138.54 150.54
## + chas 1 139.59 151.59
## + medv 1 140.16 152.16
## + X 1 140.16 152.16
## + rm 1 140.20 152.20
## + lstat 1 140.23 152.23
##
## Step: AIC=146.12
## target ~ nox + rad + age + tax + ptratio
##
## Df Deviance AIC
## + indus 1 131.67 145.67
## <none> 134.12 146.12
## + zn 1 132.39 146.39
## + dis 1 132.70 146.70
## + medv 1 133.06 147.06
## + rm 1 133.48 147.48
## + lstat 1 133.59 147.59
## + chas 1 134.03 148.03
## + X 1 134.11 148.11
##
## Step: AIC=145.67
## target ~ nox + rad + age + tax + ptratio + indus
##
## Df Deviance AIC
## <none> 131.67 145.67
## + zn 1 130.07 146.07
## + medv 1 130.43 146.43
## + dis 1 130.83 146.83
## + chas 1 130.88 146.88
## + rm 1 131.02 147.02
## + lstat 1 131.39 147.39
## + X 1 131.56 147.56
# Inspect the coefficients, deviances and AIC of the forward-selected model.
summary(fwd_final)
##
## Call:
## glm(formula = target ~ nox + rad + age + tax + ptratio + indus,
## family = binomial, data = crime_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7278 -0.0618 -0.0006 0.0006 3.2970
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -33.776657 6.463156 -5.226 1.73e-07 ***
## nox 48.433668 10.325771 4.691 2.72e-06 ***
## rad 1.047638 0.218877 4.786 1.70e-06 ***
## age 0.064191 0.015211 4.220 2.44e-05 ***
## tax -0.029223 0.006472 -4.515 6.32e-06 ***
## ptratio 0.296409 0.121030 2.449 0.0143 *
## indus 0.148125 0.094505 1.567 0.1170
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 605.53 on 436 degrees of freedom
## Residual deviance: 131.67 on 430 degrees of freedom
## AIC: 145.67
##
## Number of Fisher Scoring iterations: 9
The forward-selection model has an AIC of 145.67, which is a little higher than the 143.93 of our earlier model but is still a very good result.
# Draw the in-sample ROC curve for the forward-selected model: the observed
# `target` against the model's fitted probabilities.
roc(
  target ~ fwd_final$fitted.values,
  data = crime_df,
  plot = TRUE,
  main = "ROC CURVE",
  col = "blue"
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.formula(formula = target ~ fwd_final$fitted.values, data = crime_df, plot = TRUE, main = "ROC CURVE", col = "blue")
##
## Data: fwd_final$fitted.values in 224 controls (target 0) < 213 cases (target 1).
## Area under the curve: 0.9865
# Report only the area under the ROC curve for the forward-selected model.
auc(
  target ~ fwd_final$fitted.values,
  data = crime_df
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.9865
The AUC is slightly lower as well (0.9865 versus 0.988), but it is still very close to 1, which means that the model is accurate.