# load packages
if(!require(pacman)){install.packages("pacman"); require(pacman)}
## Loading required package: pacman
## Warning: package 'pacman' was built under R version 3.6.2
packages <- c('tidyverse', 'glue', 'broom', 'MASS', 'caret', 'InformationValue', 'Hmisc', 'kableExtra', 'corrplot', 'ROCR')
pacman::p_load(char = packages)
# get data
dfTrain <- read_csv("crime-training-data_modified.csv")
## Parsed with column specification:
## cols(
## zn = col_double(),
## indus = col_double(),
## chas = col_double(),
## nox = col_double(),
## rm = col_double(),
## age = col_double(),
## dis = col_double(),
## rad = col_double(),
## tax = col_double(),
## ptratio = col_double(),
## lstat = col_double(),
## medv = col_double(),
## target = col_double()
## )
dfEval <- read_csv("crime-evaluation-data_modified.csv")
## Parsed with column specification:
## cols(
## zn = col_double(),
## indus = col_double(),
## chas = col_double(),
## nox = col_double(),
## rm = col_double(),
## age = col_double(),
## dis = col_double(),
## rad = col_double(),
## tax = col_double(),
## ptratio = col_double(),
## lstat = col_double(),
## medv = col_double()
## )
dim(dfTrain)
## [1] 466 13
head(dfTrain)
## # A tibble: 6 x 13
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 19.6 0 0.605 7.93 96.2 2.05 5 403 14.7 3.7 50
## 2 0 19.6 1 0.871 5.40 100 1.32 5 403 14.7 26.8 13.4
## 3 0 18.1 0 0.74 6.48 100 1.98 24 666 20.2 18.8 15.4
## 4 30 4.93 0 0.428 6.39 7.8 7.04 6 300 16.6 5.19 23.7
## 5 0 2.46 0 0.488 7.16 92.2 2.70 3 193 17.8 4.82 37.9
## 6 0 8.56 0 0.52 6.78 71.3 2.86 5 384 20.9 7.67 26.5
## # ... with 1 more variable: target <dbl>
summary(dfTrain)
## zn indus chas nox
## Min. : 0.00 Min. : 0.460 Min. :0.00000 Min. :0.3890
## 1st Qu.: 0.00 1st Qu.: 5.145 1st Qu.:0.00000 1st Qu.:0.4480
## Median : 0.00 Median : 9.690 Median :0.00000 Median :0.5380
## Mean : 11.58 Mean :11.105 Mean :0.07082 Mean :0.5543
## 3rd Qu.: 16.25 3rd Qu.:18.100 3rd Qu.:0.00000 3rd Qu.:0.6240
## Max. :100.00 Max. :27.740 Max. :1.00000 Max. :0.8710
## rm age dis rad
## Min. :3.863 Min. : 2.90 Min. : 1.130 Min. : 1.00
## 1st Qu.:5.887 1st Qu.: 43.88 1st Qu.: 2.101 1st Qu.: 4.00
## Median :6.210 Median : 77.15 Median : 3.191 Median : 5.00
## Mean :6.291 Mean : 68.37 Mean : 3.796 Mean : 9.53
## 3rd Qu.:6.630 3rd Qu.: 94.10 3rd Qu.: 5.215 3rd Qu.:24.00
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.00
## tax ptratio lstat medv
## Min. :187.0 Min. :12.6 Min. : 1.730 Min. : 5.00
## 1st Qu.:281.0 1st Qu.:16.9 1st Qu.: 7.043 1st Qu.:17.02
## Median :334.5 Median :18.9 Median :11.350 Median :21.20
## Mean :409.5 Mean :18.4 Mean :12.631 Mean :22.59
## 3rd Qu.:666.0 3rd Qu.:20.2 3rd Qu.:16.930 3rd Qu.:25.00
## Max. :711.0 Max. :22.0 Max. :37.970 Max. :50.00
## target
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4914
## 3rd Qu.:1.0000
## Max. :1.0000
str(dfTrain)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 466 obs. of 13 variables:
## $ zn : num 0 0 0 30 0 0 0 0 0 80 ...
## $ indus : num 19.58 19.58 18.1 4.93 2.46 ...
## $ chas : num 0 1 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.605 0.871 0.74 0.428 0.488 0.52 0.693 0.693 0.515 0.392 ...
## $ rm : num 7.93 5.4 6.49 6.39 7.16 ...
## $ age : num 96.2 100 100 7.8 92.2 71.3 100 100 38.1 19.1 ...
## $ dis : num 2.05 1.32 1.98 7.04 2.7 ...
## $ rad : num 5 5 24 6 3 5 24 24 5 1 ...
## $ tax : num 403 403 666 300 193 384 666 666 224 315 ...
## $ ptratio: num 14.7 14.7 20.2 16.6 17.8 20.9 20.2 20.2 20.2 16.4 ...
## $ lstat : num 3.7 26.82 18.85 5.19 4.82 ...
## $ medv : num 50 13.4 15.4 23.7 37.9 26.5 5 7 22.2 20.9 ...
## $ target : num 1 1 1 0 0 0 1 1 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. zn = col_double(),
## .. indus = col_double(),
## .. chas = col_double(),
## .. nox = col_double(),
## .. rm = col_double(),
## .. age = col_double(),
## .. dis = col_double(),
## .. rad = col_double(),
## .. tax = col_double(),
## .. ptratio = col_double(),
## .. lstat = col_double(),
## .. medv = col_double(),
## .. target = col_double()
## .. )
# number of unique values for each variable
sapply(dfTrain, function(x) unique(x) %>% length)
## zn indus chas nox rm age dis rad tax ptratio
## 26 73 2 79 419 333 380 9 63 46
## lstat medv target
## 424 218 2
# any missing
colSums(is.na(dfTrain))
## zn indus chas nox rm age dis rad tax ptratio
## 0 0 0 0 0 0 0 0 0 0
## lstat medv target
## 0 0 0
# let's look at distribution of each variable by target variable
categorical_vars <- c("chas", "rad")
quantitative_vars <- names(dfTrain)[names(dfTrain) %nin% categorical_vars]
dfGather <- dfTrain %>%
dplyr::select(all_of(quantitative_vars)) %>%
dplyr::mutate(target = as.factor(target)) %>%
tidyr::gather(key, value, -target)
ggplot(dfGather, aes(value, color = target)) +
geom_density() +
geom_vline(data = aggregate(value ~ target + key, dfGather, median),
aes(xintercept = value,
color = target),
linetype = "dashed") +
facet_wrap(~ key, nrow = 5, scales = "free")
# let's look at other categorical variables
with(dfTrain, ftable(target) %>% prop.table()) %>% round(., 2)
## target 0 1
##
## 0.51 0.49
with(dfTrain, ftable(target, chas) %>% prop.table(., margin = 2) %>% round(., 2))
## chas 0 1
## target
## 0 0.52 0.36
## 1 0.48 0.64
with(dfTrain, ftable(target, rad) %>% prop.table(., margin = 2) %>% round(., 2))
## rad 1 2 3 4 5 6 7 8 24
## target
## 0 1.00 1.00 0.97 0.57 0.61 0.92 0.87 0.20 0.00
## 1 0.00 0.00 0.03 0.43 0.39 0.08 0.13 0.80 1.00
*** The good news is that we have no missing data, and the target variable is nearly balanced between 0 and 1. The density plots and crosstabs suggest that most of the variables separate the two target classes to some degree; ‘rm’ (average number of rooms per dwelling) is probably the only one that shows essentially no relationship to the target. The dashed line inside each subplot marks the median (rather than the mean) because several of the distributions are skewed. Next, we look at the correlations among all variables using a corrplot.
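*** As a quick numeric check of that visual impression, we can tabulate the per-class medians of each quantitative predictor (a sketch added for illustration, reusing the dfTrain and quantitative_vars objects defined above; this output was not part of the original run):
# sketch: median of each quantitative predictor within each target class
dfTrain %>%
  dplyr::select(all_of(quantitative_vars)) %>%
  dplyr::group_by(target) %>%
  dplyr::summarise_all(median)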
dfTrain %>%
cor() %>%
corrplot(method = "number", type = "upper", order = "hclust")
*** ‘tax’ and ‘rad’ are strongly positively correlated with each other, while ‘nox’ and ‘dis’ are moderately negatively correlated. Most interesting, ‘target’ and ‘dis’ are also moderately negatively correlated. Could it be that the farther a neighborhood is from the employment centers, the less likely its residents are to be looking for work (because they already hold jobs, perhaps better white-collar ones), and hence the lower the crime rate?
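*** To put numbers on the pairs called out above, the individual correlations can be pulled directly (a sketch for illustration; not part of the original output):
# sketch: specific correlations referenced in the discussion above
with(dfTrain, round(c(tax_rad = cor(tax, rad),
                      nox_dis = cor(nox, dis),
                      target_dis = cor(target, dis)), 2))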
# split dfTrain into train and test sets
set.seed(1234)
index <- sample(1:nrow(dfTrain), nrow(dfTrain) * 0.7)
train <- dfTrain[index, ]
test <- dfTrain[-index, ]
# model 1 - full model
fullModel <- glm(target ~ ., data = train, family = binomial(link = "logit"))
broom::glance(fullModel)
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 452. 325 -67.9 162. 211. 136. 313
# model 2 - sqrt transformation of the continuous predictors to reduce the influence of outliers
DV <- names(dfTrain)[names(train) %nin% c(categorical_vars, 'target')]
fullModelSqrt <- glm(target ~ ., data = train %>% dplyr::mutate_at(vars(all_of(DV)), sqrt), family = binomial(link = "logit"))
broom::glance(fullModelSqrt)
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 452. 325 -69.6 165. 214. 139. 313
# model 3 - stepwise selection (direction defaults to "both")
step <- MASS::stepAIC(fullModel, trace = FALSE)
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv
##
## Final Model:
## target ~ zn + nox + dis + rad + tax + ptratio + lstat + medv
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 313 135.8165 161.8165
## 2 - rm 1 0.01926505 314 135.8357 159.8357
## 3 - chas 1 0.14981839 315 135.9856 157.9856
## 4 - indus 1 0.90705213 316 136.8926 156.8926
## 5 - age 1 1.26913328 317 138.1618 156.1618
stepModel <- glm(target ~ zn + nox + dis + rad + tax + ptratio + lstat + medv, data = train, family = binomial(link = "logit"))
broom::glance(stepModel)
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 452. 325 -69.1 156. 190. 138. 317
# model 4 - repeated, k-fold cross-validation
set.seed(1234)
train.control <- caret::trainControl(method = "repeatedcv", number = 10, repeats = 10)
kFoldModel <- caret::train(as.factor(target) ~ zn + nox + dis + rad + tax + ptratio + lstat + medv,
data = train, method = "glm", trControl = train.control)
summary(kFoldModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2871 -0.1966 -0.0018 0.0031 3.1962
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -34.838404 6.599061 -5.279 1.30e-07 ***
## zn -0.083238 0.040479 -2.056 0.03975 *
## nox 43.080971 7.779899 5.537 3.07e-08 ***
## dis 0.536634 0.247813 2.165 0.03035 *
## rad 0.719849 0.172935 4.163 3.15e-05 ***
## tax -0.007869 0.003530 -2.229 0.02580 *
## ptratio 0.256700 0.124271 2.066 0.03886 *
## lstat 0.083631 0.053347 1.568 0.11696
## medv 0.131206 0.049573 2.647 0.00813 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 451.82 on 325 degrees of freedom
## Residual deviance: 138.16 on 317 degrees of freedom
## AIC: 156.16
##
## Number of Fisher Scoring iterations: 9
*** We have built four logistic regression models using different approaches. First, we fit a full model on the training split using all predictors. Second, we applied a square root transformation to all continuous predictors to see whether it improved the fit. Third, we used a stepwise approach searching in both directions (forward and backward). Finally, we took the formula produced by the stepwise search and refit it with repeated k-fold cross-validation (10 folds, repeated 10 times). We compared the four models using the Akaike Information Criterion (AIC) and settled on the stepwise formula (target ~ zn + nox + dis + rad + tax + ptratio + lstat + medv). It achieves the lowest AIC, 156.16, down from 161.8 for the full model, and the result looks reasonable. For example, dropping ‘rm’ makes sense because the average number of rooms per dwelling has no obvious logical connection to the crime-rate target. Likewise, the stepwise search dropped other variables with weak rationale, such as whether the suburb borders the Charles River (‘chas’) and the proportion of non-retail business acres per suburb (‘indus’). Let's turn to further evaluation of each model before applying the chosen model to the evaluation data set for prediction.
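*** The AIC comparison described above can also be tabulated directly from the fitted objects (a sketch for illustration, not part of the original output; kFoldModel$finalModel holds caret's refit glm):
# sketch: side-by-side AIC for the four models
data.frame(model = c("full", "full - sqrt", "stepwise", "k-fold (final glm)"),
           AIC = c(AIC(fullModel), AIC(fullModelSqrt), AIC(stepModel), AIC(kFoldModel$finalModel)))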
# set threshold
optCutOff <- 0.5
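*** We keep a fixed cutoff of 0.5 for all comparisons below. A data-driven alternative (shown for reference only and not used in the rest of the analysis) would be InformationValue::optimalCutoff(), which picks a cutoff from the predicted scores on the test split:
# sketch: data-driven cutoff as an alternative to the fixed 0.5 (for reference only)
InformationValue::optimalCutoff(test$target, predict(stepModel, test, type = "response"))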
# prediction for each model
predicted_1 <- list(predict(fullModel, test, type = "response"))
predicted_2 <- list(predict(fullModelSqrt, test, type = "response"))
predicted_3 <- list(predict(stepModel, test, type = "response"))
predicted_4 <- list(predict(kFoldModel, test, type = "raw"))
predictions <- list(predicted_1, predicted_2, predicted_3, predicted_4)
# diagnostic - confusion matrix (columns are actuals, rows are predicted classes)
cm_1 <- InformationValue::confusionMatrix(test$target, predicted = predicted_1 %>% unlist, threshold = optCutOff)
cm_2 <- InformationValue::confusionMatrix(test$target, predicted = predicted_2 %>% unlist, threshold = optCutOff)
cm_3 <- InformationValue::confusionMatrix(test$target, predicted = predicted_3 %>% unlist, threshold = optCutOff)
cm_4 <- ftable(predicted_4 %>% unlist, test$target)
cm_1
## 0 1
## 0 63 8
## 1 8 61
cm_2
## 0 1
## 0 32 47
## 1 39 22
cm_3
## 0 1
## 0 62 12
## 1 9 57
cm_4
## 0 1
##
## 0 62 12
## 1 9 57
# diagnostic - AUROC, misClassification, precision, sensitivity, specificity
modelResultList <- vector(mode = "list", length = 3)
names(modelResultList) <- c("full", "full - sqrt transformed", "stepwise")
for(i in 1:length(modelResultList)){
# metrics
auroc <- InformationValue::AUROC(test$target, predictions[[i]] %>% unlist)
misCE <- InformationValue::misClassError(test$target, predictions[[i]] %>% unlist, threshold = optCutOff)
pre <- InformationValue::precision(test$target, predictions[[i]] %>% unlist, threshold = optCutOff)
sen <- InformationValue::sensitivity(test$target, predictions[[i]] %>% unlist, threshold = optCutOff)
spe <- InformationValue::specificity(test$target, predictions[[i]] %>% unlist, threshold = optCutOff)
cm <- InformationValue::confusionMatrix(test$target, predictions[[i]] %>% unlist, threshold = optCutOff)
TN <- cm[1, 1]
FN <- cm[1, 2]
FP <- cm[2, 1]
TP <- cm[2, 2]
# diagnostic output
tempList <- list(
data.frame(model = names(modelResultList)[i],
auroc = round(auroc, 2),
misClass = round(misCE, 2),
precision = round(pre, 2),
sensitivity = round(sen, 2),
specificity = round(spe, 2),
`True Positive` = TP,
`True Negative` = TN,
`False Positive` = FP,
`False Negative` = FN,
accuracy = round((TP + TN) / (TP + TN + FP + FN), 2),
F1 = round(2 * ((pre * sen) / (pre + sen)), 2))
)
# put together
modelResultList <- c(modelResultList, tempList)
}
modelResultDf <- modelResultList %>% dplyr::bind_rows()
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector, coercing
## into character vector
modelResultDf %>% kableExtra::kable()
| model | auroc | misClass | precision | sensitivity | specificity | True.Positive | True.Negative | False.Positive | False.Negative | accuracy | F1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| full | 0.97 | 0.11 | 0.88 | 0.88 | 0.89 | 61 | 63 | 8 | 8 | 0.89 | 0.88 |
| full - sqrt transformed | 0.35 | 0.61 | 0.36 | 0.32 | 0.45 | 22 | 32 | 39 | 47 | 0.39 | 0.34 |
| stepwise | 0.96 | 0.15 | 0.86 | 0.83 | 0.87 | 57 | 62 | 9 | 12 | 0.85 | 0.84 |
# ROC Curve
par(mfrow = c(1, 3))
pred <- prediction(predicted_1 %>% unlist, test$target)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE, main = "Full Model")
pred <- prediction(predicted_2 %>% unlist, test$target)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE, main = "Full Model - Sqrt")
pred <- prediction(predicted_3 %>% unlist, test$target)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE, main = "Step Model")
# Final print of the stepModel
tidy(stepModel)
## # A tibble: 9 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -34.8 6.60 -5.28 0.000000130
## 2 zn -0.0832 0.0405 -2.06 0.0397
## 3 nox 43.1 7.78 5.54 0.0000000307
## 4 dis 0.537 0.248 2.17 0.0304
## 5 rad 0.720 0.173 4.16 0.0000315
## 6 tax -0.00787 0.00353 -2.23 0.0258
## 7 ptratio 0.257 0.124 2.07 0.0389
## 8 lstat 0.0836 0.0533 1.57 0.117
## 9 medv 0.131 0.0496 2.65 0.00813
# Prediction for Evaluation Set
dfEval <- dfEval %>%
dplyr::mutate(prediction = predict(stepModel, dfEval, type = "response"),
prediction = ifelse(prediction > optCutOff, 1, 0))
table(dfEval$prediction)
##
## 0 1
## 18 22
*** The results above show that approaches 3 and 4 (stepModel, kFoldModel) produce identical confusion matrices; since they are effectively the same model, kFoldModel was left out of the for() loop comparison. Somewhat surprisingly, fullModel edges out stepModel on the test split: higher area under the ROC curve, lower misclassification error, better sensitivity and accuracy, and so forth. Nevertheless, we go with the stepwise model because it trims off irrelevant variables and is therefore more parsimonious.
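*** For interpreting the chosen model, the stepwise coefficients can be exponentiated into odds ratios (a sketch for illustration; not part of the original output). Values above 1 indicate higher odds of the high-crime class as the predictor increases, holding the other predictors fixed:
# sketch: odds ratios with 95% confidence intervals for the stepwise model
broom::tidy(stepModel, exponentiate = TRUE, conf.int = TRUE) %>%
  dplyr::mutate_if(is.numeric, round, 3)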