library(forecast)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(e1071)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
library(MASS)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  1.4.2     ✔ purrr   0.2.5
## ✔ tidyr   0.8.1     ✔ dplyr   0.7.6
## ✔ readr   1.1.1     ✔ stringr 1.3.1
## ✔ tibble  1.4.2     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()  masks randomForest::combine()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ cowplot::ggsave() masks ggplot2::ggsave()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ ggplot2::margin() masks randomForest::margin()
## ✖ dplyr::select()   masks MASS::select()
library(broom)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
train_df <- read.csv("https://raw.githubusercontent.com/simplymathematics/621/master/HW3/crime-training-data_modified.csv")
test_df <- read.csv("https://raw.githubusercontent.com/simplymathematics/621/master/HW3/crime-evaluation-data_modified.csv")
str(train_df)
## 'data.frame':    466 obs. of  13 variables:
##  $ zn     : num  0 0 0 30 0 0 0 0 0 80 ...
##  $ indus  : num  19.58 19.58 18.1 4.93 2.46 ...
##  $ chas   : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.605 0.871 0.74 0.428 0.488 0.52 0.693 0.693 0.515 0.392 ...
##  $ rm     : num  7.93 5.4 6.49 6.39 7.16 ...
##  $ age    : num  96.2 100 100 7.8 92.2 71.3 100 100 38.1 19.1 ...
##  $ dis    : num  2.05 1.32 1.98 7.04 2.7 ...
##  $ rad    : int  5 5 24 6 3 5 24 24 5 1 ...
##  $ tax    : int  403 403 666 300 193 384 666 666 224 315 ...
##  $ ptratio: num  14.7 14.7 20.2 16.6 17.8 20.9 20.2 20.2 20.2 16.4 ...
##  $ lstat  : num  3.7 26.82 18.85 5.19 4.82 ...
##  $ medv   : num  50 13.4 15.4 23.7 37.9 26.5 5 7 22.2 20.9 ...
##  $ target : int  1 1 1 0 0 0 1 1 0 0 ...
# Convert the response and the Charles River dummy variable to factors
train_df$target <- as.factor(train_df$target)
train_df$chas <- as.factor(train_df$chas)
colnames(train_df)
##  [1] "zn"      "indus"   "chas"    "nox"     "rm"      "age"     "dis"    
##  [8] "rad"     "tax"     "ptratio" "lstat"   "medv"    "target"

Splitting the data set into training and test sets.

set.seed(123)
training.samples <- train_df$target %>% 
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- train_df[training.samples, ]
test.data <- train_df[-training.samples, ]
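
createDataPartition samples within each class of the outcome, so both splits keep approximately the same proportion of target = 0 and target = 1 as the full data set.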

A helper function that computes a confusion matrix on the held-out test split.

# Helper: predict on the held-out test split, classify at a 0.5 probability
# threshold, and return a confusion matrix. (Named conf_matrix to avoid
# shadowing base R's matrix().)
conf_matrix <- function(model_name){
        probabilities <- model_name %>% predict(test.data, type = "response")
        predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
        confusionMatrix(as.factor(predicted.classes), as.factor(test.data$target), dnn = c("Prediction", "Reference"))
}

MODEL 1

Basic Model

model_1 <- step(glm(target ~ ., data = train_df, family = 'binomial'), direction = 'backward')
## Start:  AIC=218.05
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + lstat + medv
## 
##           Df Deviance    AIC
## - rm       1   192.71 216.71
## - lstat    1   192.77 216.77
## - chas     1   193.53 217.53
## - indus    1   193.99 217.99
## <none>         192.05 218.05
## - tax      1   196.59 220.59
## - zn       1   196.89 220.89
## - age      1   198.73 222.73
## - medv     1   199.95 223.95
## - ptratio  1   203.32 227.32
## - dis      1   203.84 227.84
## - rad      1   233.74 257.74
## - nox      1   265.05 289.05
## 
## Step:  AIC=216.71
## target ~ zn + indus + chas + nox + age + dis + rad + tax + ptratio + 
##     lstat + medv
## 
##           Df Deviance    AIC
## - chas     1   194.24 216.24
## - lstat    1   194.32 216.32
## - indus    1   194.58 216.58
## <none>         192.71 216.71
## - tax      1   197.59 219.59
## - zn       1   198.07 220.07
## - age      1   199.11 221.11
## - ptratio  1   203.53 225.53
## - dis      1   203.85 225.85
## - medv     1   205.35 227.35
## - rad      1   233.81 255.81
## - nox      1   265.14 287.14
## 
## Step:  AIC=216.24
## target ~ zn + indus + nox + age + dis + rad + tax + ptratio + 
##     lstat + medv
## 
##           Df Deviance    AIC
## - indus    1   195.51 215.51
## <none>         194.24 216.24
## - lstat    1   196.33 216.33
## - zn       1   200.59 220.59
## - tax      1   200.75 220.75
## - age      1   201.00 221.00
## - ptratio  1   203.94 223.94
## - dis      1   204.83 224.83
## - medv     1   207.12 227.12
## - rad      1   241.41 261.41
## - nox      1   265.19 285.19
## 
## Step:  AIC=215.51
## target ~ zn + nox + age + dis + rad + tax + ptratio + lstat + 
##     medv
## 
##           Df Deviance    AIC
## - lstat    1   197.32 215.32
## <none>         195.51 215.51
## - zn       1   202.05 220.05
## - age      1   202.23 220.23
## - ptratio  1   205.01 223.01
## - dis      1   205.96 223.96
## - tax      1   206.60 224.60
## - medv     1   208.13 226.13
## - rad      1   249.55 267.55
## - nox      1   270.59 288.59
## 
## Step:  AIC=215.32
## target ~ zn + nox + age + dis + rad + tax + ptratio + medv
## 
##           Df Deviance    AIC
## <none>         197.32 215.32
## - zn       1   203.45 219.45
## - ptratio  1   206.27 222.27
## - age      1   207.13 223.13
## - tax      1   207.62 223.62
## - dis      1   207.64 223.64
## - medv     1   208.65 224.65
## - rad      1   250.98 266.98
## - nox      1   273.18 289.18
summary(model_1)
## 
## Call:
## glm(formula = target ~ zn + nox + age + dis + rad + tax + ptratio + 
##     medv, family = "binomial", data = train_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8295  -0.1752  -0.0021   0.0032   3.4191  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -37.415922   6.035013  -6.200 5.65e-10 ***
## zn           -0.068648   0.032019  -2.144  0.03203 *  
## nox          42.807768   6.678692   6.410 1.46e-10 ***
## age           0.032950   0.010951   3.009  0.00262 ** 
## dis           0.654896   0.214050   3.060  0.00222 ** 
## rad           0.725109   0.149788   4.841 1.29e-06 ***
## tax          -0.007756   0.002653  -2.924  0.00346 ** 
## ptratio       0.323628   0.111390   2.905  0.00367 ** 
## medv          0.110472   0.035445   3.117  0.00183 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 197.32  on 457  degrees of freedom
## AIC: 215.32
## 
## Number of Fisher Scoring iterations: 9
conf_matrix(model_1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 41  4
##          1  6 41
##                                           
##                Accuracy : 0.8913          
##                  95% CI : (0.8092, 0.9466)
##     No Information Rate : 0.5109          
##     P-Value [Acc > NIR] : 7.782e-15       
##                                           
##                   Kappa : 0.7827          
##  Mcnemar's Test P-Value : 0.7518          
##                                           
##             Sensitivity : 0.8723          
##             Specificity : 0.9111          
##          Pos Pred Value : 0.9111          
##          Neg Pred Value : 0.8723          
##              Prevalence : 0.5109          
##          Detection Rate : 0.4457          
##    Detection Prevalence : 0.4891          
##       Balanced Accuracy : 0.8917          
##                                           
##        'Positive' Class : 0               
## 
pR2(model_1)
##          llh      llhNull           G2     McFadden         r2ML 
##  -98.6614264 -322.9379132  448.5529737    0.6944879    0.6180861 
##         r2CU 
##    0.8241958
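
For context, McFadden pseudo-R² values of roughly 0.2 to 0.4 already indicate an excellent fit for a logistic regression, so the 0.69 observed here reflects very strong explanatory power.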

Check model_1 for the following logistic regression assumptions:

  1. The outcome is binary (True; see the quick check below)
  2. There is a linear relationship between the logit of the outcome and each predictor variable (if not, the model can benefit from variable transformations)
  3. There are no influential values (extreme values or outliers) in the continuous predictors.
  4. There is no high intercorrelation (i.e., multicollinearity) among the predictors.
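
Assumption 1 can be verified directly; a minimal check of the response factor:

levels(train_df$target)    # expect exactly the two classes "0" and "1"
nlevels(train_df$target) == 2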

Checking for a linear relationship between the logit of the outcome and each predictor variable.

probabilities <- predict(model_1, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
# Select only numeric predictors
mydata <- train_df %>%
  dplyr::select_if(is.numeric) 
predictors <- colnames(mydata)
# Bind the logit and tidying the data for plot
mydata <- mydata %>%
  mutate(logit = log(probabilities/(1-probabilities))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)

ggplot(mydata, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")

The relationships are not linear, so the model may benefit from variable transformations.
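
As an illustration of the kind of transformation that could help (Model 3 below applies Box-Cox transformations systematically), a skewed predictor such as dis could be log-transformed and the two fits compared by AIC. A minimal sketch, where model_1_log is an illustrative name only:

# Illustrative only: refit the stepwise model with log(dis) and compare AIC
model_1_log <- glm(target ~ zn + nox + age + log(dis) + rad + tax + ptratio + medv,
                   data = train_df, family = "binomial")
AIC(model_1, model_1_log)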

Checking model_1 for the presence of influential (high-leverage) values.

plot(model_1, which = 4, id.n = 5)

# Extract model results
model.data <- augment(model_1) %>% 
  mutate(index = 1:n()) 
model.data %>% top_n(5, .cooksd)
## # A tibble: 5 x 17
##   target    zn   nox   age   dis   rad   tax ptratio  medv .fitted .se.fit
##   <fct>  <dbl> <dbl> <dbl> <dbl> <int> <int>   <dbl> <dbl>   <dbl>   <dbl>
## 1 1         22 0.431   8.4  8.91     7   330    19.1  42.8  -0.941   0.970
## 2 1          0 0.544  37.8  2.52     4   304    18.4  16.1  -2.96    0.706
## 3 1         22 0.431  34.9  8.06     7   330    19.1  24.3  -2.67    0.652
## 4 1         20 0.464  42.1  4.43     3   223    18.6  21.1  -5.84    0.936
## 5 1          0 0.489   9.8  3.59     4   277    18.6  23.7  -4.42    0.832
## # ... with 6 more variables: .resid <dbl>, .hat <dbl>, .sigma <dbl>,
## #   .cooksd <dbl>, .std.resid <dbl>, index <int>
ggplot(model.data, aes(index, .std.resid)) + 
  geom_point(aes(color = target), alpha = .5) +
  theme_bw()

model.data %>% 
  filter(abs(.std.resid) > 3)
## # A tibble: 1 x 17
##   target    zn   nox   age   dis   rad   tax ptratio  medv .fitted .se.fit
##   <fct>  <dbl> <dbl> <dbl> <dbl> <int> <int>   <dbl> <dbl>   <dbl>   <dbl>
## 1 1         20 0.464  42.1  4.43     3   223    18.6  21.1   -5.84   0.936
## # ... with 6 more variables: .resid <dbl>, .hat <dbl>, .sigma <dbl>,
## #   .cooksd <dbl>, .std.resid <dbl>, index <int>

Eliminating the row with the influential value.

# Drop the flagged row, matched on its nox and age values
train_df_clean <- train_df %>% 
  filter(!(nox==0.464 & age==42.1))
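
Matching on floating-point values works here but is fragile; an equivalent approach could reuse the row index already computed in model.data (outlier_idx is a new helper name introduced for this sketch):

# Alternative sketch: drop the rows flagged by |standardized residual| > 3 via their index
outlier_idx <- model.data %>% filter(abs(.std.resid) > 3) %>% pull(index)
train_df_clean <- train_df[-outlier_idx, ]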

Checking for intercorrelations (multicollinearity) among the predictors.

vif(model_1)
##       zn      nox      age      dis      rad      tax  ptratio     medv 
## 1.789037 3.172660 1.701974 3.595939 1.697110 1.754274 1.865085 2.193689

All VIF values are well below the common threshold of 5, so no significant multicollinearity is detected in model_1.
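
This can be confirmed programmatically:

any(vif(model_1) > 5)    # FALSE: every VIF is below 5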

MODEL 2

Building a model on the dataset with the influential value removed.

model_2 <- step(glm(target ~ ., data = train_df_clean, family = 'binomial'), direction = 'backward')
## Start:  AIC=204.95
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + lstat + medv
## 
##           Df Deviance    AIC
## - lstat    1   179.53 203.53
## - rm       1   179.86 203.86
## - chas     1   180.40 204.40
## <none>         178.95 204.95
## - indus    1   181.26 205.26
## - tax      1   182.93 206.93
## - zn       1   186.28 210.28
## - age      1   187.56 211.56
## - medv     1   188.43 212.43
## - ptratio  1   190.95 214.95
## - dis      1   194.36 218.36
## - rad      1   221.84 245.84
## - nox      1   258.08 282.08
## 
## Step:  AIC=203.53
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + medv
## 
##           Df Deviance    AIC
## - chas     1   181.28 203.28
## - rm       1   181.38 203.38
## <none>         179.53 203.53
## - indus    1   181.76 203.76
## - tax      1   183.26 205.26
## - zn       1   186.46 208.46
## - medv     1   189.16 211.16
## - ptratio  1   192.50 214.50
## - age      1   192.97 214.97
## - dis      1   195.50 217.50
## - rad      1   222.50 244.50
## - nox      1   259.97 281.97
## 
## Step:  AIC=203.28
## target ~ zn + indus + nox + rm + age + dis + rad + tax + ptratio + 
##     medv
## 
##           Df Deviance    AIC
## - indus    1   182.79 202.79
## <none>         181.28 203.28
## - rm       1   183.38 203.38
## - tax      1   186.60 206.60
## - zn       1   189.44 209.44
## - medv     1   191.08 211.08
## - ptratio  1   193.09 213.09
## - age      1   195.88 215.88
## - dis      1   196.73 216.73
## - rad      1   232.03 252.03
## - nox      1   260.00 280.00
## 
## Step:  AIC=202.79
## target ~ zn + nox + rm + age + dis + rad + tax + ptratio + medv
## 
##           Df Deviance    AIC
## - rm       1   184.66 202.66
## <none>         182.79 202.79
## - zn       1   191.25 209.25
## - medv     1   192.17 210.17
## - tax      1   192.67 210.67
## - ptratio  1   194.12 212.12
## - age      1   196.72 214.72
## - dis      1   197.96 215.96
## - rad      1   240.79 258.79
## - nox      1   266.03 284.03
## 
## Step:  AIC=202.66
## target ~ zn + nox + age + dis + rad + tax + ptratio + medv
## 
##           Df Deviance    AIC
## <none>         184.66 202.66
## - zn       1   193.60 209.60
## - ptratio  1   194.15 210.15
## - tax      1   194.85 210.85
## - age      1   196.83 212.83
## - dis      1   198.25 214.25
## - medv     1   198.43 214.43
## - rad      1   240.96 256.96
## - nox      1   266.17 282.17
summary(model_2)
## 
## Call:
## glm(formula = target ~ zn + nox + age + dis + rad + tax + ptratio + 
##     medv, family = "binomial", data = train_df_clean)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8555  -0.1501  -0.0006   0.0014   3.1726  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -42.128250   6.730139  -6.260 3.86e-10 ***
## zn           -0.093902   0.036896  -2.545 0.010927 *  
## nox          47.388109   7.340291   6.456 1.08e-10 ***
## age           0.038718   0.011711   3.306 0.000946 ***
## dis           0.807838   0.235713   3.427 0.000610 ***
## rad           0.806479   0.161555   4.992 5.98e-07 ***
## tax          -0.007945   0.002739  -2.900 0.003728 ** 
## ptratio       0.350885   0.117755   2.980 0.002884 ** 
## medv          0.130814   0.038521   3.396 0.000684 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 644.45  on 464  degrees of freedom
## Residual deviance: 184.66  on 456  degrees of freedom
## AIC: 202.66
## 
## Number of Fisher Scoring iterations: 9
vif(model_2)
##       zn      nox      age      dis      rad      tax  ptratio     medv 
## 1.960798 3.435929 1.773456 4.001177 1.770018 1.759468 1.985662 2.336037
conf_matrix(model_2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 41  5
##          1  6 40
##                                           
##                Accuracy : 0.8804          
##                  95% CI : (0.7961, 0.9388)
##     No Information Rate : 0.5109          
##     P-Value [Acc > NIR] : 5.644e-14       
##                                           
##                   Kappa : 0.7609          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8723          
##             Specificity : 0.8889          
##          Pos Pred Value : 0.8913          
##          Neg Pred Value : 0.8696          
##              Prevalence : 0.5109          
##          Detection Rate : 0.4457          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.8806          
##                                           
##        'Positive' Class : 0               
## 
pR2(model_2)
##          llh      llhNull           G2     McFadden         r2ML 
##  -92.3291377 -322.2263367  459.7943981    0.7134650    0.6279791 
##         r2CU 
##    0.8374100

MODEL 3

Applying Box-Cox variable transformations.

model_3 <- train(target ~ ., data = train_df_clean, 
              method = "glm", family = "binomial",
              preProcess = c("BoxCox"), trControl = trainControl(
                  method = "cv", number = 10,
                  savePredictions = TRUE))

# Variables that were affected by the Box-Cox transformation
process <- preProcess(select(train_df_clean, -target),
                        method = c("BoxCox"))
process$method
## $BoxCox
##  [1] "indus"   "nox"     "rm"      "age"     "dis"     "rad"     "tax"    
##  [8] "ptratio" "lstat"   "medv"   
## 
## $ignore
## [1] "chas"
summary(model_3)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.99963  -0.08835  -0.00040   0.09534   3.14474  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  15.972742  41.521252   0.385 0.700468    
## zn           -0.033415   0.030692  -1.089 0.276283    
## indus        -0.036839   0.232913  -0.158 0.874324    
## chas1         1.029523   0.808347   1.274 0.202800    
## nox          15.506450   2.431033   6.379 1.79e-10 ***
## rm           -2.733095   2.958518  -0.924 0.355587    
## age           0.013945   0.004176   3.339 0.000841 ***
## dis           3.841600   0.928034   4.140 3.48e-05 ***
## rad           3.400070   0.806710   4.215 2.50e-05 ***
## tax         -13.823020  22.144986  -0.624 0.532493    
## ptratio       0.027592   0.007587   3.637 0.000276 ***
## lstat        -0.104096   0.464218  -0.224 0.822570    
## medv          2.763687   0.914536   3.022 0.002511 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 644.45  on 464  degrees of freedom
## Residual deviance: 184.24  on 452  degrees of freedom
## AIC: 210.24
## 
## Number of Fisher Scoring iterations: 8
vif(model_3$finalModel)
##       zn    indus    chas1      nox       rm      age      dis      rad 
## 1.498521 2.989495 1.321736 4.017031 4.581452 2.700827 3.832244 2.253491 
##      tax  ptratio    lstat     medv 
## 2.955169 2.660317 3.740465 8.062733
probabilities <- model_3 %>% predict(test.data, type = "prob")
predicted.classes <- ifelse((probabilities[,2]) > 0.5, 1, 0)

confusionMatrix(as.factor(predicted.classes), as.factor(test.data$target), dnn = c("Prediction", "Reference"))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 41  4
##          1  6 41
##                                           
##                Accuracy : 0.8913          
##                  95% CI : (0.8092, 0.9466)
##     No Information Rate : 0.5109          
##     P-Value [Acc > NIR] : 7.782e-15       
##                                           
##                   Kappa : 0.7827          
##  Mcnemar's Test P-Value : 0.7518          
##                                           
##             Sensitivity : 0.8723          
##             Specificity : 0.9111          
##          Pos Pred Value : 0.9111          
##          Neg Pred Value : 0.8723          
##              Prevalence : 0.5109          
##          Detection Rate : 0.4457          
##    Detection Prevalence : 0.4891          
##       Balanced Accuracy : 0.8917          
##                                           
##        'Positive' Class : 0               
## 

MODEL 4

Based on important variables.

Selecting important variables using the caret package.

# prepare training scheme
control <- trainControl(method="repeatedcv", number=10, repeats=3)
# train the model
model <- train(target~., data=train_df, method="glm",  trControl=control)
# estimate variable importance
importance <- varImp(model, scale=FALSE)
# summarize importance
print(importance)
## glm variable importance
## 
##         Overall
## nox      6.1932
## rad      4.0843
## dis      3.2077
## ptratio  3.1791
## medv     2.6477
## age      2.4749
## tax      2.0887
## zn       1.9029
## indus    1.3568
## chas1    1.2054
## lstat    0.8486
## rm       0.8127
# plot importance
plot(importance)
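
Based on this ranking, lstat and rm are the least important predictors, so they are dropped in model_4.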

model_4 <- glm(target ~ ., data = train_df %>% select(-lstat, -rm), family = 'binomial')
summary(model_4)
## 
## Call:
## glm(formula = target ~ ., family = "binomial", data = train_df %>% 
##     select(-lstat, -rm))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8087  -0.1791  -0.0025   0.0041   3.4569  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -40.451410   6.412534  -6.308 2.82e-10 ***
## zn           -0.063334   0.032883  -1.926  0.05410 .  
## indus        -0.059204   0.047258  -1.253  0.21028    
## chas1         1.089782   0.773974   1.408  0.15912    
## nox          47.790906   7.704999   6.203 5.55e-10 ***
## age           0.032699   0.011119   2.941  0.00327 ** 
## dis           0.691906   0.218023   3.174  0.00151 ** 
## rad           0.631833   0.155927   4.052 5.08e-05 ***
## tax          -0.006063   0.002927  -2.072  0.03831 *  
## ptratio       0.356879   0.113638   3.140  0.00169 ** 
## medv          0.111851   0.035310   3.168  0.00154 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 194.32  on 455  degrees of freedom
## AIC: 216.32
## 
## Number of Fisher Scoring iterations: 9
vif(model_4)
##       zn    indus     chas      nox      age      dis      rad      tax 
## 1.740018 2.636065 1.188059 3.995876 1.711978 3.537228 1.815751 2.082398 
##  ptratio     medv 
## 1.863116 2.219497
conf_matrix(model_4)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 43  4
##          1  4 41
##                                           
##                Accuracy : 0.913           
##                  95% CI : (0.8358, 0.9617)
##     No Information Rate : 0.5109          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.826           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9149          
##             Specificity : 0.9111          
##          Pos Pred Value : 0.9149          
##          Neg Pred Value : 0.9111          
##              Prevalence : 0.5109          
##          Detection Rate : 0.4674          
##    Detection Prevalence : 0.5109          
##       Balanced Accuracy : 0.9130          
##                                           
##        'Positive' Class : 0               
## 
pR2(model_4)
##          llh      llhNull           G2     McFadden         r2ML 
##  -97.1623981 -322.9379132  451.5510303    0.6991298    0.6205353 
##         r2CU 
##    0.8274617

MODEL 5

Selected by the lowest Akaike information criterion (AIC), using stepAIC from the MASS package.

# Fit the model
model_5 <- glm(target ~., data = train.data, family = binomial) %>%
  stepAIC(trace = FALSE)
# Summarize the final selected model
summary(model_5)
## 
## Call:
## glm(formula = target ~ zn + nox + age + dis + rad + tax + ptratio + 
##     medv, family = binomial, data = train.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8476  -0.1617  -0.0019   0.0047   3.3501  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -35.267187   6.652404  -5.301 1.15e-07 ***
## zn           -0.070513   0.036909  -1.910  0.05607 .  
## nox          42.743396   7.365107   5.803 6.49e-09 ***
## age           0.031482   0.012618   2.495  0.01260 *  
## dis           0.607943   0.245430   2.477  0.01325 *  
## rad           0.660875   0.163688   4.037 5.40e-05 ***
## tax          -0.008578   0.003269  -2.624  0.00869 ** 
## ptratio       0.273859   0.122210   2.241  0.02503 *  
## medv          0.097398   0.039693   2.454  0.01414 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 518.38  on 373  degrees of freedom
## Residual deviance: 157.61  on 365  degrees of freedom
## AIC: 175.61
## 
## Number of Fisher Scoring iterations: 9
conf_matrix(model_5)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 41  4
##          1  6 41
##                                           
##                Accuracy : 0.8913          
##                  95% CI : (0.8092, 0.9466)
##     No Information Rate : 0.5109          
##     P-Value [Acc > NIR] : 7.782e-15       
##                                           
##                   Kappa : 0.7827          
##  Mcnemar's Test P-Value : 0.7518          
##                                           
##             Sensitivity : 0.8723          
##             Specificity : 0.9111          
##          Pos Pred Value : 0.9111          
##          Neg Pred Value : 0.8723          
##              Prevalence : 0.5109          
##          Detection Rate : 0.4457          
##    Detection Prevalence : 0.4891          
##       Balanced Accuracy : 0.8917          
##                                           
##        'Positive' Class : 0               
## 
pR2(model_5)
##          llh      llhNull           G2     McFadden         r2ML 
##  -78.8030913 -259.1889151  360.7716477    0.6959627    0.6188758 
##         r2CU 
##    0.8252386

MODEL 6

Using an SVM with a nonlinear (radial basis) kernel, for comparison with the logistic regression models.

model_6 <- train(target ~ ., data = train_df, 
              method = "svmRadial",
              # preProcess = c("center", "scale"),
              trControl = trainControl(method = "cv", number = 10, savePredictions = TRUE))
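
With method = "svmRadial", caret tunes the cost parameter C over a small default grid by cross-validation, while the kernel width sigma is estimated analytically by kernlab.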

predicted.classes <- model_6 %>% predict(test.data)
mean(predicted.classes==test.data$target)
## [1] 0.9456522
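
Accuracy alone hides the error structure; the same confusionMatrix call used for the earlier models could be applied to the SVM predictions as well:

# Full confusion matrix for the SVM, for a like-for-like comparison with the glm models
confusionMatrix(predicted.classes, as.factor(test.data$target), dnn = c("Prediction", "Reference"))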