Load packages and data

# load packages
if(!require(pacman)){install.packages("pacman"); require(pacman)}
## Loading required package: pacman
## Warning: package 'pacman' was built under R version 3.6.2
packages <- c('tidyverse', 'glue', 'broom', 'MASS', 'caret', 'InformationValue', 'Hmisc', 'kableExtra', 'corrplot', 'ROCR')
pacman::p_load(char = packages)

# get data
dfTrain <- read_csv("crime-training-data_modified.csv")
## Parsed with column specification:
## cols(
##   zn = col_double(),
##   indus = col_double(),
##   chas = col_double(),
##   nox = col_double(),
##   rm = col_double(),
##   age = col_double(),
##   dis = col_double(),
##   rad = col_double(),
##   tax = col_double(),
##   ptratio = col_double(),
##   lstat = col_double(),
##   medv = col_double(),
##   target = col_double()
## )
dfEval <- read_csv("crime-evaluation-data_modified.csv")
## Parsed with column specification:
## cols(
##   zn = col_double(),
##   indus = col_double(),
##   chas = col_double(),
##   nox = col_double(),
##   rm = col_double(),
##   age = col_double(),
##   dis = col_double(),
##   rad = col_double(),
##   tax = col_double(),
##   ptratio = col_double(),
##   lstat = col_double(),
##   medv = col_double()
## )

Part I & II : Data Exploration, Preparation

dim(dfTrain)
## [1] 466  13
head(dfTrain)
## # A tibble: 6 x 13
##      zn indus  chas   nox    rm   age   dis   rad   tax ptratio lstat  medv
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1     0 19.6      0 0.605  7.93  96.2  2.05     5   403    14.7  3.7   50  
## 2     0 19.6      1 0.871  5.40 100    1.32     5   403    14.7 26.8   13.4
## 3     0 18.1      0 0.74   6.48 100    1.98    24   666    20.2 18.8   15.4
## 4    30  4.93     0 0.428  6.39   7.8  7.04     6   300    16.6  5.19  23.7
## 5     0  2.46     0 0.488  7.16  92.2  2.70     3   193    17.8  4.82  37.9
## 6     0  8.56     0 0.52   6.78  71.3  2.86     5   384    20.9  7.67  26.5
## # ... with 1 more variable: target <dbl>
summary(dfTrain)
##        zn             indus             chas              nox        
##  Min.   :  0.00   Min.   : 0.460   Min.   :0.00000   Min.   :0.3890  
##  1st Qu.:  0.00   1st Qu.: 5.145   1st Qu.:0.00000   1st Qu.:0.4480  
##  Median :  0.00   Median : 9.690   Median :0.00000   Median :0.5380  
##  Mean   : 11.58   Mean   :11.105   Mean   :0.07082   Mean   :0.5543  
##  3rd Qu.: 16.25   3rd Qu.:18.100   3rd Qu.:0.00000   3rd Qu.:0.6240  
##  Max.   :100.00   Max.   :27.740   Max.   :1.00000   Max.   :0.8710  
##        rm             age              dis              rad       
##  Min.   :3.863   Min.   :  2.90   Min.   : 1.130   Min.   : 1.00  
##  1st Qu.:5.887   1st Qu.: 43.88   1st Qu.: 2.101   1st Qu.: 4.00  
##  Median :6.210   Median : 77.15   Median : 3.191   Median : 5.00  
##  Mean   :6.291   Mean   : 68.37   Mean   : 3.796   Mean   : 9.53  
##  3rd Qu.:6.630   3rd Qu.: 94.10   3rd Qu.: 5.215   3rd Qu.:24.00  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.00  
##       tax           ptratio         lstat             medv      
##  Min.   :187.0   Min.   :12.6   Min.   : 1.730   Min.   : 5.00  
##  1st Qu.:281.0   1st Qu.:16.9   1st Qu.: 7.043   1st Qu.:17.02  
##  Median :334.5   Median :18.9   Median :11.350   Median :21.20  
##  Mean   :409.5   Mean   :18.4   Mean   :12.631   Mean   :22.59  
##  3rd Qu.:666.0   3rd Qu.:20.2   3rd Qu.:16.930   3rd Qu.:25.00  
##  Max.   :711.0   Max.   :22.0   Max.   :37.970   Max.   :50.00  
##      target      
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.4914  
##  3rd Qu.:1.0000  
##  Max.   :1.0000
str(dfTrain)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 466 obs. of  13 variables:
##  $ zn     : num  0 0 0 30 0 0 0 0 0 80 ...
##  $ indus  : num  19.58 19.58 18.1 4.93 2.46 ...
##  $ chas   : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.605 0.871 0.74 0.428 0.488 0.52 0.693 0.693 0.515 0.392 ...
##  $ rm     : num  7.93 5.4 6.49 6.39 7.16 ...
##  $ age    : num  96.2 100 100 7.8 92.2 71.3 100 100 38.1 19.1 ...
##  $ dis    : num  2.05 1.32 1.98 7.04 2.7 ...
##  $ rad    : num  5 5 24 6 3 5 24 24 5 1 ...
##  $ tax    : num  403 403 666 300 193 384 666 666 224 315 ...
##  $ ptratio: num  14.7 14.7 20.2 16.6 17.8 20.9 20.2 20.2 20.2 16.4 ...
##  $ lstat  : num  3.7 26.82 18.85 5.19 4.82 ...
##  $ medv   : num  50 13.4 15.4 23.7 37.9 26.5 5 7 22.2 20.9 ...
##  $ target : num  1 1 1 0 0 0 1 1 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   zn = col_double(),
##   ..   indus = col_double(),
##   ..   chas = col_double(),
##   ..   nox = col_double(),
##   ..   rm = col_double(),
##   ..   age = col_double(),
##   ..   dis = col_double(),
##   ..   rad = col_double(),
##   ..   tax = col_double(),
##   ..   ptratio = col_double(),
##   ..   lstat = col_double(),
##   ..   medv = col_double(),
##   ..   target = col_double()
##   .. )

# number of unique values for each variable
sapply(dfTrain, function(x) unique(x) %>% length)
##      zn   indus    chas     nox      rm     age     dis     rad     tax ptratio 
##      26      73       2      79     419     333     380       9      63      46 
##   lstat    medv  target 
##     424     218       2

# any missing
colSums(is.na(dfTrain))
##      zn   indus    chas     nox      rm     age     dis     rad     tax ptratio 
##       0       0       0       0       0       0       0       0       0       0 
##   lstat    medv  target 
##       0       0       0

# let's look at distribution of each variable by target variable
categorical_vars <- c("chas", "rad")
quantitative_vars <- names(dfTrain)[names(dfTrain) %nin% categorical_vars]
dfGather <- dfTrain %>%
        dplyr::select(all_of(quantitative_vars)) %>%
        dplyr::mutate(target = as.factor(target)) %>%
        tidyr::gather(key, value, -target)

ggplot(dfGather, aes(value, color = target)) +
        geom_density() +
        geom_vline(data = aggregate(value ~ target + key, dfGather, median), 
                   aes(xintercept = value,
                       color = target),
                   linetype = "dashed") +
        facet_wrap(~ key, nrow = 5, scales = "free")


# let's look at other categorical variables        
with(dfTrain, ftable(target) %>% prop.table()) %>% round(., 2)
## target    0    1
##                 
##        0.51 0.49
with(dfTrain, ftable(target, chas) %>% prop.table(., margin = 2) %>% round(., 2))
##        chas    0    1
## target               
## 0           0.52 0.36
## 1           0.48 0.64
with(dfTrain, ftable(target, rad) %>% prop.table(., margin = 2) %>% round(., 2))
##        rad    1    2    3    4    5    6    7    8   24
## target                                                 
## 0          1.00 1.00 0.97 0.57 0.61 0.92 0.87 0.20 0.00
## 1          0.00 0.00 0.03 0.43 0.39 0.08 0.13 0.80 1.00

*** The good news is that we have no missing data. The target classes are nearly balanced (51% vs. 49%). The density plots and crosstabs suggest that most of these variables separate the two target classes to some degree; ‘rm’ (average number of rooms per dwelling) is probably the only variable that shows little relationship to the target. The dashed line in each subplot marks the median (rather than the mean) to guard against skewed distributions. After tabulating those medians below, we look at the correlations among all variables using a corrplot.
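A minimal companion to the density plots: the same aggregation used for the dashed lines, returned as a table (tidyr::spread is just one way to widen it).

# medians of each quantitative variable by target, matching the dashed lines in the plot
dfGather %>%
        dplyr::group_by(key, target) %>%
        dplyr::summarise(median = median(value)) %>%
        tidyr::spread(target, median)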

dfTrain %>%
        cor() %>%
        corrplot(method = "number", type = "upper", order = "hclust")

*** ‘tax’ and ‘rad’ are strongly positively correlated, whereas ‘nox’ and ‘dis’ are moderately negatively correlated. The most interesting finding is that ‘target’ and ‘dis’ are also moderately negatively correlated. Could that be because the farther a neighborhood is from an employment center, the less likely its residents are to be seeking work (they already have jobs, perhaps better white-collar ones), and thus the fewer crimes occur?
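To back the visual reading with numbers, a small sketch that lists the strongest pairwise correlations from the same matrix (the Var1/Var2/Freq column names come from as.table()).

# strongest pairwise correlations, lower triangle only
corMat <- cor(dfTrain)
corMat[upper.tri(corMat, diag = TRUE)] <- NA
as.data.frame(as.table(corMat)) %>%
        dplyr::filter(!is.na(Freq)) %>%
        dplyr::arrange(desc(abs(Freq))) %>%
        head(10)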

Part III : Build Models

# split dfTrain into train and test sets
set.seed(1234)
index <- sample(1:nrow(dfTrain), nrow(dfTrain) * 0.7)
train <- dfTrain[index, ]
test <- dfTrain[-index, ]

# model 1 - full model
fullModel <- glm(target ~ ., data = train, family = binomial(link = "logit"))        
broom::glance(fullModel)
## # A tibble: 1 x 7
##   null.deviance df.null logLik   AIC   BIC deviance df.residual
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int>
## 1          452.     325  -67.9  162.  211.     136.         313

# model 2 - sqrt transformation of the continuous predictors to reduce the influence of outliers
DV <- names(dfTrain)[names(train) %nin% c(categorical_vars, 'target')] # names of the continuous predictors
fullModelSqrt <- glm(target ~ ., data = train %>% dplyr::mutate_at(vars(DV), sqrt), family = binomial(link = "logit"))
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(DV)` instead of `DV` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
broom::glance(fullModelSqrt)
## # A tibble: 1 x 7
##   null.deviance df.null logLik   AIC   BIC deviance df.residual
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int>
## 1          452.     325  -69.6  165.  214.     139.         313
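As the tidyselect note above suggests, wrapping the external vector in all_of() silences the message; the fit itself is unchanged.

# same model, with DV wrapped in all_of() to avoid the ambiguity note
fullModelSqrt <- glm(target ~ .,
                     data = train %>% dplyr::mutate_at(vars(all_of(DV)), sqrt),
                     family = binomial(link = "logit"))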

# model 3 - stepwise selection (direction defaults to both)
step <- MASS::stepAIC(fullModel, trace = FALSE)
step$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + lstat + medv
## 
## Final Model:
## target ~ zn + nox + dis + rad + tax + ptratio + lstat + medv
## 
## 
##      Step Df   Deviance Resid. Df Resid. Dev      AIC
## 1                             313   135.8165 161.8165
## 2    - rm  1 0.01926505       314   135.8357 159.8357
## 3  - chas  1 0.14981839       315   135.9856 157.9856
## 4 - indus  1 0.90705213       316   136.8926 156.8926
## 5   - age  1 1.26913328       317   138.1618 156.1618
stepModel <- glm(target ~ zn + nox + dis + rad + tax + ptratio + lstat + medv, data = train, family = binomial(link = "logit"))
broom::glance(stepModel)
## # A tibble: 1 x 7
##   null.deviance df.null logLik   AIC   BIC deviance df.residual
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int>
## 1          452.     325  -69.1  156.  190.     138.         317
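As an optional check (not part of the original comparison), the terms dropped by stepAIC can be tested jointly with a likelihood-ratio test between the nested models.

# likelihood-ratio test: stepwise (reduced) vs. full model
anova(stepModel, fullModel, test = "Chisq")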

# model 4 - repeated, k-fold cross-validation
set.seed(1234)
train.control <- caret::trainControl(method = "repeatedcv", number = 10, repeats = 10)        
index <- sample(1:nrow(dfTrain), size = nrow(dfTrain) * 0.7)
kFoldModel <- caret::train(as.factor(target) ~ zn + nox + dis + rad + tax + ptratio + lstat + medv,
                           data = train, method = "glm", trControl = train.control)        
summary(kFoldModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2871  -0.1966  -0.0018   0.0031   3.1962  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -34.838404   6.599061  -5.279 1.30e-07 ***
## zn           -0.083238   0.040479  -2.056  0.03975 *  
## nox          43.080971   7.779899   5.537 3.07e-08 ***
## dis           0.536634   0.247813   2.165  0.03035 *  
## rad           0.719849   0.172935   4.163 3.15e-05 ***
## tax          -0.007869   0.003530  -2.229  0.02580 *  
## ptratio       0.256700   0.124271   2.066  0.03886 *  
## lstat         0.083631   0.053347   1.568  0.11696    
## medv          0.131206   0.049573   2.647  0.00813 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 451.82  on 325  degrees of freedom
## Residual deviance: 138.16  on 317  degrees of freedom
## AIC: 156.16
## 
## Number of Fisher Scoring iterations: 9

*** We have built four logistic regression models using different approaches. First, we fit a full model using all predictors. Second, we applied a square-root transformation to all continuous predictors to see whether it helped. Third, we used a stepwise approach searching in both directions (forward and backward). Finally, we took the formula selected by the stepwise approach and refit it with repeated k-fold cross-validation (10 folds, repeated 10 times). We compared the four models using the Akaike Information Criterion (AIC; collected side by side in the sketch below) and settled on the formula proposed by the stepwise approach (target ~ zn + nox + dis + rad + tax + ptratio + lstat + medv), refit with k-fold cross-validation. This achieved the lowest AIC of 156.16 (down from 161.8 for the full model), and the result looks reasonable. For example, ‘rm’ should be removed because the average number of rooms per dwelling has no obvious logical connection to the target variable (crime rate). Likewise, the stepwise approach dropped other variables that make little sense here, such as whether the suburb borders the Charles River (‘chas’) and the proportion of non-retail business acres per suburb (‘indus’). Let’s turn to further evaluation of each model before applying the chosen one to the evaluation data set for prediction.
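A compact way to place the AIC values side by side; kFoldModel$finalModel is the glm that caret refits on the training split, so its AIC should match the stepwise formula.

# AIC for each candidate model in one table
data.frame(model = c("full", "full - sqrt", "stepwise", "k-fold (step formula)"),
           AIC = c(AIC(fullModel), AIC(fullModelSqrt), AIC(stepModel), AIC(kFoldModel$finalModel)))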

Part IV : Select Models

# set threshold
optCutOff <- 0.5

# prediction for each model
predicted_1 <- list(predict(fullModel, test, type = "response"))
predicted_2 <- list(predict(fullModelSqrt, test, type = "response"))
predicted_3 <- list(predict(stepModel, test, type = "response"))
predicted_4 <- list(predict(kFoldModel, test, type = "raw"))
predictions <- list(predicted_1, predicted_2, predicted_3, predicted_4)
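As an optional check on the 0.5 threshold set above, InformationValue::optimalCutoff() searches for the probability cutoff that minimizes misclassification error (shown here for the full model only).

# data-driven cutoff for the full model's predicted probabilities
InformationValue::optimalCutoff(test$target, predicted_1 %>% unlist, optimiseFor = "misclasserror")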

# diagnostic - Confusion Matrix (columns are actuals, rows are predicted classes)
cm_1 <- InformationValue::confusionMatrix(test$target, predicted = predicted_1 %>% unlist, threshold = optCutOff)
cm_2 <- InformationValue::confusionMatrix(test$target, predicted = predicted_2 %>% unlist, threshold = optCutOff)
cm_3 <- InformationValue::confusionMatrix(test$target, predicted = predicted_3 %>% unlist, threshold = optCutOff)
cm_4 <- ftable(predicted_4 %>% unlist, test$target)

cm_1
##    0  1
## 0 63  8
## 1  8 61
cm_2
##    0  1
## 0 32 47
## 1 39 22
cm_3
##    0  1
## 0 62 12
## 1  9 57
cm_4
##     0  1
##         
## 0  62 12
## 1   9 57
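Since the k-fold model is left out of the metrics loop below, its test-set accuracy can be checked directly from the raw class predictions (it matches the stepwise model, as the identical confusion matrix suggests).

# accuracy of the k-fold model on the test split
mean(as.integer(as.character(unlist(predicted_4))) == test$target)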

# diagnostic - AUROC, misClassification, precision, sensitivity, specificity
modelResultList <- vector(mode = "list", length = 3)
names(modelResultList) <- c("full", "full - sqrt transformed", "stepwise")

for(i in 1:length(modelResultList)){
        
        # metrics
        auroc <- InformationValue::AUROC(test$target, predictions[[i]] %>% unlist)
        misCE <- InformationValue::misClassError(test$target, predictions[[i]] %>% unlist, threshold = optCutOff)
        pre <- InformationValue::precision(test$target, predictions[[i]] %>% unlist, threshold = optCutOff)
        sen <- InformationValue::sensitivity(test$target, predictions[[i]] %>% unlist, threshold = optCutOff)
        spe <- InformationValue::specificity(test$target, predictions[[i]] %>% unlist, threshold = optCutOff)
        cm <- InformationValue::confusionMatrix(test$target, predictions[[i]] %>% unlist, threshold = optCutOff)
        TN <- cm[1, 1]
        FN <- cm[1, 2]
        FP <- cm[2, 1]
        TP <- cm[2, 2]
        
        # diagnostic output
        tempList <- list(
                data.frame(model = names(modelResultList)[i],
                           auroc = round(auroc, 2),
                           misClass = round(misCE, 2),
                           precision = round(pre, 2),
                           sensitivity = round(sen, 2),
                           specificity = round(spe, 2),
                           `True Positive` = TP,
                           `True Negative` = TN,
                           `False Positive` = FP,
                           `False Negative` = FN,
                           accuracy = round((TP + TN) / (TP + TN + FP + FN), 2), 
                           F1 = round(2 * ((pre * sen) / (pre + sen)), 2))
        )

        # put together
        modelResultList <- c(modelResultList, tempList)
        
}

modelResultDf <- modelResultList %>% dplyr::bind_rows()
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
modelResultDf %>% kableExtra::kable()
model                     auroc  misClass  precision  sensitivity  specificity  True.Positive  True.Negative  False.Positive  False.Negative  accuracy    F1
full                       0.97      0.11       0.88         0.88         0.89             61             63               8               8      0.89  0.88
full - sqrt transformed    0.35      0.61       0.36         0.32         0.45             22             32              39              47      0.39  0.34
stepwise                   0.96      0.15       0.86         0.83         0.87             57             62               9              12      0.85  0.84

# ROC Curve
par(mfrow = c(1, 3))
pred <- prediction(predicted_1 %>% unlist, test$target)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE, main = "Full Model")

pred <- prediction(predicted_2 %>% unlist, test$target)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE, main = "Full Model - Sqrt")

pred <- prediction(predicted_3 %>% unlist, test$target)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE, main = "Step Model")


# Final print of the stepModel
tidy(stepModel)
## # A tibble: 9 x 5
##   term         estimate std.error statistic      p.value
##   <chr>           <dbl>     <dbl>     <dbl>        <dbl>
## 1 (Intercept) -34.8       6.60        -5.28 0.000000130 
## 2 zn           -0.0832    0.0405      -2.06 0.0397      
## 3 nox          43.1       7.78         5.54 0.0000000307
## 4 dis           0.537     0.248        2.17 0.0304      
## 5 rad           0.720     0.173        4.16 0.0000315   
## 6 tax          -0.00787   0.00353     -2.23 0.0258      
## 7 ptratio       0.257     0.124        2.07 0.0389      
## 8 lstat         0.0836    0.0533       1.57 0.117       
## 9 medv          0.131     0.0496       2.65 0.00813

# Prediction for Evaluation Set
dfEval <- dfEval %>%
        dplyr::mutate(prediction = predict(stepModel, dfEval, type = "response"),
                      prediction = ifelse(prediction > optCutOff, 1, 0))
table(dfEval$prediction)
## 
##  0  1 
## 18 22

*** The results above indicate that approaches 3 and 4 (stepModel, kFoldModel) share an identical confusion matrix. They are essentially the same model, so we excluded the k-fold model from the for() loop comparison. Surprisingly, the fullModel is slightly better than the stepModel: higher area under the ROC curve, lower misclassification error, and better sensitivity and accuracy. However, we decided to go with the stepwise approach because it trims off irrelevant variables and yields a more parsimonious model.
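Finally, a minimal sketch for saving the evaluation-set predictions to disk; the output file name here is just a placeholder.

# write out the evaluation-set predictions (placeholder file name)
readr::write_csv(dfEval, "crime-evaluation-predictions.csv")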