library(data.table)
## Warning: package 'data.table' was built under R version 3.3.2
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.2
library(pROC)
## Warning: package 'pROC' was built under R version 3.3.2
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
#please connect to wifi
crimeeval <- "https://raw.githubusercontent.com/Misterresearch/CUNY-Projects/master/crime-evaluation-data.csv"

crimetraining <-"https://raw.githubusercontent.com/Misterresearch/CUNY-Projects/master/crime-training-data.csv"
  
  
crimeeval <- read.table(file = crimeeval, header = TRUE, sep = ",", na.strings = "NA")

crimetraining <- read.table(file = crimetraining, header = TRUE, sep = ",", na.strings = "NA")

DATA EXPLORATION

For this study, we’ve been provided with a two-way random partition of evaluation and training data sets. The evaluation/test data is not used for our predictive model. However, we will analyze the evaluation set’s mean “yes” classification (prevalence), and compare it against the training data set once our probabilities and classifications have been created.

However, first we will check for any missing values in our training data set. The check below reveals that there are no missing data, and as a result we do not need to develop methods for imputation going forward.

#NA check, no imputation required
sum(is.na(crimetraining$zn))
## [1] 0
sum(is.na(crimetraining$indus))
## [1] 0
sum(is.na(crimetraining$chas))
## [1] 0
sum(is.na(crimetraining$nox))
## [1] 0
sum(is.na(crimetraining$rm))
## [1] 0
sum(is.na(crimetraining$age))
## [1] 0
sum(is.na(crimetraining$dis))
## [1] 0
sum(is.na(crimetraining$rad))
## [1] 0
sum(is.na(crimetraining$tax))
## [1] 0
sum(is.na(crimetraining$black))
## [1] 0
sum(is.na(crimetraining$lstat))
## [1] 0
sum(is.na(crimetraining$medv))
## [1] 0
sum(is.na(crimetraining$target))
## [1] 0
#Summary Stats
summary(crimetraining)
##        zn             indus             chas              nox        
##  Min.   :  0.00   Min.   : 0.460   Min.   :0.00000   Min.   :0.3890  
##  1st Qu.:  0.00   1st Qu.: 5.145   1st Qu.:0.00000   1st Qu.:0.4480  
##  Median :  0.00   Median : 9.690   Median :0.00000   Median :0.5380  
##  Mean   : 11.58   Mean   :11.105   Mean   :0.07082   Mean   :0.5543  
##  3rd Qu.: 16.25   3rd Qu.:18.100   3rd Qu.:0.00000   3rd Qu.:0.6240  
##  Max.   :100.00   Max.   :27.740   Max.   :1.00000   Max.   :0.8710  
##        rm             age              dis              rad       
##  Min.   :3.863   Min.   :  2.90   Min.   : 1.130   Min.   : 1.00  
##  1st Qu.:5.887   1st Qu.: 43.88   1st Qu.: 2.101   1st Qu.: 4.00  
##  Median :6.210   Median : 77.15   Median : 3.191   Median : 5.00  
##  Mean   :6.291   Mean   : 68.37   Mean   : 3.796   Mean   : 9.53  
##  3rd Qu.:6.630   3rd Qu.: 94.10   3rd Qu.: 5.215   3rd Qu.:24.00  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.00  
##       tax           ptratio         black            lstat       
##  Min.   :187.0   Min.   :12.6   Min.   :  0.32   Min.   : 1.730  
##  1st Qu.:281.0   1st Qu.:16.9   1st Qu.:375.61   1st Qu.: 7.043  
##  Median :334.5   Median :18.9   Median :391.34   Median :11.350  
##  Mean   :409.5   Mean   :18.4   Mean   :357.12   Mean   :12.631  
##  3rd Qu.:666.0   3rd Qu.:20.2   3rd Qu.:396.24   3rd Qu.:16.930  
##  Max.   :711.0   Max.   :22.0   Max.   :396.90   Max.   :37.970  
##       medv           target      
##  Min.   : 5.00   Min.   :0.0000  
##  1st Qu.:17.02   1st Qu.:0.0000  
##  Median :21.20   Median :0.0000  
##  Mean   :22.59   Mean   :0.4914  
##  3rd Qu.:25.00   3rd Qu.:1.0000  
##  Max.   :50.00   Max.   :1.0000

A quick inspection of the descriptive summaries, the difference between the means and medians, offers a clue regarding the variables that might not be normally distributed: zn, chas (binary), age, rad, tax & black.

Below is a plot of histograms that confirm our suspicions in some cases.

par(mar=c(1,1,1,1))
par(mfrow=c(5,3))
hist(crimetraining$zn, cex.main=.7)
hist(crimetraining$indus,cex.main=.7 )
hist(crimetraining$chas, cex.main=.7)
hist(crimetraining$nox, cex.main=.7)
hist(crimetraining$rm, cex.main=.7)
hist(crimetraining$age, cex.main=.7)
hist(crimetraining$dis,cex.main=.7)
hist(crimetraining$rad,cex.main=.7)
hist(crimetraining$tax,cex.main=.7)
hist(crimetraining$rad,cex.main=.7)
hist(crimetraining$ptratio,cex.main=.7)
hist(crimetraining$black,cex.main=.7)
hist(crimetraining$medv,cex.main=.7)
hist(crimetraining$lstat,cex.main=.7)

Based on the correlation matrix below, it’s probably a good idea to drop “chas”, from the model. It’s correlation coefficient is very - even considering that it and the dependent variable are dichotomous models.

Also, rad and tax have .91 correlation, so it seems that one of them should be dropped from our model. We’ll see during our transformation, which one we decided to keep and why.

#Correlation Matrix
crimetrainingmatrix<- cor(crimetraining)
round(crimetrainingmatrix,2)
##            zn indus  chas   nox    rm   age   dis   rad   tax ptratio
## zn       1.00 -0.54 -0.04 -0.52  0.32 -0.57  0.66 -0.32 -0.32   -0.39
## indus   -0.54  1.00  0.06  0.76 -0.39  0.64 -0.70  0.60  0.73    0.39
## chas    -0.04  0.06  1.00  0.10  0.09  0.08 -0.10 -0.02 -0.05   -0.13
## nox     -0.52  0.76  0.10  1.00 -0.30  0.74 -0.77  0.60  0.65    0.18
## rm       0.32 -0.39  0.09 -0.30  1.00 -0.23  0.20 -0.21 -0.30   -0.36
## age     -0.57  0.64  0.08  0.74 -0.23  1.00 -0.75  0.46  0.51    0.26
## dis      0.66 -0.70 -0.10 -0.77  0.20 -0.75  1.00 -0.49 -0.53   -0.23
## rad     -0.32  0.60 -0.02  0.60 -0.21  0.46 -0.49  1.00  0.91    0.47
## tax     -0.32  0.73 -0.05  0.65 -0.30  0.51 -0.53  0.91  1.00    0.47
## ptratio -0.39  0.39 -0.13  0.18 -0.36  0.26 -0.23  0.47  0.47    1.00
## black    0.18 -0.36  0.04 -0.38  0.13 -0.27  0.29 -0.45 -0.44   -0.18
## lstat   -0.43  0.61 -0.05  0.60 -0.63  0.61 -0.51  0.50  0.56    0.38
## medv     0.38 -0.50  0.16 -0.43  0.71 -0.38  0.26 -0.40 -0.49   -0.52
## target  -0.43  0.60  0.08  0.73 -0.15  0.63 -0.62  0.63  0.61    0.25
##         black lstat  medv target
## zn       0.18 -0.43  0.38  -0.43
## indus   -0.36  0.61 -0.50   0.60
## chas     0.04 -0.05  0.16   0.08
## nox     -0.38  0.60 -0.43   0.73
## rm       0.13 -0.63  0.71  -0.15
## age     -0.27  0.61 -0.38   0.63
## dis      0.29 -0.51  0.26  -0.62
## rad     -0.45  0.50 -0.40   0.63
## tax     -0.44  0.56 -0.49   0.61
## ptratio -0.18  0.38 -0.52   0.25
## black    1.00 -0.35  0.33  -0.35
## lstat   -0.35  1.00 -0.74   0.47
## medv     0.33 -0.74  1.00  -0.27
## target  -0.35  0.47 -0.27   1.00
#Correlation to Target variable
corcrimetrainingtb <- data.table(crimetrainingmatrix, keep.rownames = TRUE)
corcrimetrainingtb[,c(1,15)]
##          rn      target
##  1:      zn -0.43168176
##  2:   indus  0.60485074
##  3:    chas  0.08004187
##  4:     nox  0.72610622
##  5:      rm -0.15255334
##  6:     age  0.63010625
##  7:     dis -0.61867312
##  8:     rad  0.62810492
##  9:     tax  0.61111331
## 10: ptratio  0.25084892
## 11:   black -0.35295680
## 12:   lstat  0.46912702
## 13:    medv -0.27055071
## 14:  target  1.00000000
pairs(~zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv, data=crimetraining, main = "Crime Training All")

DATA PREPARATION

Based on our initial loading of the data, there no values missing - eliminating the need for imputation.

The only dichotomous independent variable that we had in the model was actually removed, because of a low correlation to the dependent variable (target). Further, we’ll say later on with our AIC results, that the optimal model actually gets drops this variable as well - confirming our good judgment here. The primary mathematical modeling that was done here attempts to transform skewed of bimodal variables to a norm distribution using log transformations, and then for a the full model using Box-Cox transformations.

Although chas, and tax have bimodal and right skewed qualities - they have already been removed from our model. The renaming variables that need transformation are black, rad, tax & indus. Of these that underwent a natural log transformation, indus and rad became normally distributed.

Because the Box-Cox transformation was performed on an entire model, we’ll save those results for the model building section.

#natural log transformation
crimetraining$lnblack<-log(crimetraining$black)
crimetraining$lnrad<-log(crimetraining$rad)
crimetraining$lntax<-log(crimetraining$tax)
crimetraining$lnindus<-log(crimetraining$indus)

par(mar=c(1,1,1,1))
par(mfrow=c(2,2))
#retain bimodal qualities
hist(crimetraining$lnblack)
hist(crimetraining$lntax)

#become normally distributed
hist(crimetraining$lnindus)
hist(crimetraining$lnrad)

Model Buildling

Given the transformations above it’s time to figure out the best model for our data. We’ll use the principles of backwards and forwards selection via options in the AIC package, as well as using a more manual approach - with an eye always on multicollinearity.

Our first logit model starts with all variables, except “chas” (this was eliminated earlier), without any transformation. We’ll also look at its R2, and compare it to another logit model (logitA), with transformed rad (lnrad) and indus(lnindus) variables. We’ll also attempt the Box-Cox transformation on the original model, with a change to the dependent variable only.

  1. The original logit model(logit) yields a McFadden’ pseudo \R^2 of .83, however it has difference signs for rad and tax…and we know that these variables have an almost perfect linear relationship - indicating the possibility of multicollinearity.

  2. The second logit model(logitA), transforms indus and rad to their natural log forms, and drops tax. In this form the model yields an \R^2 of .82…with statistical significance for zn, lnindus, nox, age, dis, rad & ptratio - all with expected coefficient signs.

  3. The only transformation to our dependent variable that ensured positive non-zero values for the Box-Cox, was to add a constant (.0001), transforming “target” to “target1”. The Box-Cox transformation yields a \lambda of zero. Therefore, this wasn’t useful in created a new regression model. Any further attempt to use Box-Cox was abandoned.

Finally, the AIC package was used on our original logit model that contained all variables, prior to any log transformations.

The model that yielded the lowest AIC value (220), included the following variables: zn + nox + age + dis + lnrad + ptratio + black + medv.

This further indicates, that we were correct to remove tax from our model, in favor of rad (lnrad). The AIC was deployed against the all variables without an transformations applied, in our final model we will substitute “rad” with “lnrad”, and “indus” with “lnindus”, because of the their more desirable distribution.

The final analysis our model tells us that while socio-economic factors such as the number of rooms in a dwelling(rm),race (black) and lower social status (lstat) have an intuitive appeal in predicting an incidence of crime; almost all of our socio-economic variables were statistically significant. However, the variables that were statistically significant, such as proportion of a area that is zoned for large lots, how much manufacturing occurs as represented by the amount of nitrogen oxides (nox), age of the buildings and distance to highway (rad) - all point to location, perhaps in isolated urban industrial areas as being the predictors that have a statistically significant relationship to crime outcomes. Earlier, we mentioned that most of the socio-economic factors in our model were not statistically significant, however two variables that might be considered socio-economic did show statistical significance - distance to employment office (dis) and pupil/teacher ratios in classrooms. The “dis” variable might be as much as an indicator of deserted areas and location, as much as it is socio-economic status. Also, the ratio of pupils to teacher (ptratio), while almost certainly a socio-economic indicator - is also an indicator of population density, and whether or not an area is urban or suburban.

The final model tells us that geographic proximity is more important socio-economic variables, especially those that demographic such as race and status. However, more bi-directional analysis is required to understand the relationship between variables such as nitrogen oxide volume (the most impactful of our variables) and other independent variables; to determine if it is a proxy for unexplained socio-economic factors not captured in our model.

#install.packages("rms")
library(rms)
## Warning: package 'rms' was built under R version 3.3.2
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.3.2
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
#ols=lm(target ~ zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv, data = crimetraining)
#summary(ols)

#contains coefficients and AIC
logit=glm(target ~ zn+indus+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv, data = crimetraining, family = binomial(link = "logit"))
summary(logit)
## 
## Call:
## glm(formula = target ~ zn + indus + nox + rm + age + dis + rad + 
##     tax + ptratio + black + lstat + medv, family = binomial(link = "logit"), 
##     data = crimetraining)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2099  -0.1356  -0.0013   0.0018   3.4691  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -35.393661   6.866186  -5.155 2.54e-07 ***
## zn           -0.069129   0.034604  -1.998  0.04575 *  
## indus        -0.057736   0.046446  -1.243  0.21384    
## nox          48.003143   7.781466   6.169 6.88e-10 ***
## rm           -0.703120   0.738498  -0.952  0.34105    
## age           0.035141   0.013766   2.553  0.01069 *  
## dis           0.738029   0.231466   3.189  0.00143 ** 
## rad           0.720422   0.163465   4.407 1.05e-05 ***
## tax          -0.007419   0.003002  -2.472  0.01345 *  
## ptratio       0.409562   0.128375   3.190  0.00142 ** 
## black        -0.012478   0.006486  -1.924  0.05438 .  
## lstat         0.057059   0.053145   1.074  0.28298    
## medv          0.201062   0.070966   2.833  0.00461 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 188.02  on 453  degrees of freedom
## AIC: 214.02
## 
## Number of Fisher Scoring iterations: 9
#contains coefficients and R2
logitlrm<-lrm(target ~ zn+indus+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv, data = crimetraining)
logitlrm
## Logistic Regression Model
##  
##  lrm(formula = target ~ zn + indus + nox + rm + age + dis + rad + 
##      tax + ptratio + black + lstat + medv, data = crimetraining)
##  
##                        Model Likelihood     Discrimination     Rank Discrim.    
##                           Ratio Test            Indexes           Indexes       
##  Obs           466    LR chi2     457.86    R2        0.834    C       0.975    
##   0            237    d.f.            12    g        11.225    Dxy     0.949    
##   1            229    Pr(> chi2) <0.0001    gr    74998.421    gamma   0.949    
##  max |deriv| 8e-07                          gp        0.475    tau-a   0.475    
##                                             Brier     0.062                     
##  
##            Coef     S.E.   Wald Z Pr(>|Z|)
##  Intercept -35.3937 6.8662 -5.15  <0.0001 
##  zn         -0.0691 0.0346 -2.00  0.0457  
##  indus      -0.0577 0.0464 -1.24  0.2138  
##  nox        48.0031 7.7815  6.17  <0.0001 
##  rm         -0.7031 0.7385 -0.95  0.3410  
##  age         0.0351 0.0138  2.55  0.0107  
##  dis         0.7380 0.2315  3.19  0.0014  
##  rad         0.7204 0.1635  4.41  <0.0001 
##  tax        -0.0074 0.0030 -2.47  0.0134  
##  ptratio     0.4096 0.1284  3.19  0.0014  
##  black      -0.0125 0.0065 -1.92  0.0544  
##  lstat       0.0571 0.0531  1.07  0.2830  
##  medv        0.2011 0.0710  2.83  0.0046  
## 
logitA=glm(target ~ zn+lnindus+nox+rm+age+dis+lnrad+ptratio+black+lstat+medv, data = crimetraining, family = binomial(link = "logit"))
summary(logitA)
## 
## Call:
## glm(formula = target ~ zn + lnindus + nox + rm + age + dis + 
##     lnrad + ptratio + black + lstat + medv, family = binomial(link = "logit"), 
##     data = crimetraining)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0672  -0.1793  -0.0014   0.0521   3.5217  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -33.666084   6.399740  -5.261 1.44e-07 ***
## zn           -0.075896   0.033078  -2.294 0.021764 *  
## lnindus      -0.257639   0.465543  -0.553 0.579978    
## nox          39.674508   6.832508   5.807 6.37e-09 ***
## rm           -0.757564   0.710380  -1.066 0.286233    
## age           0.034624   0.013179   2.627 0.008607 ** 
## dis           0.813769   0.222254   3.661 0.000251 ***
## lnrad         2.721452   0.611634   4.449 8.61e-06 ***
## ptratio       0.343898   0.119984   2.866 0.004154 ** 
## black        -0.010315   0.005888  -1.752 0.079790 .  
## lstat         0.028100   0.052392   0.536 0.591727    
## medv          0.223111   0.070588   3.161 0.001574 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 200.53  on 454  degrees of freedom
## AIC: 224.53
## 
## Number of Fisher Scoring iterations: 8
logitAlrm=lrm(target ~ zn+lnindus+nox+rm+age+dis+lnrad+ptratio+black+lstat+medv, data = crimetraining)
logitAlrm
## Logistic Regression Model
##  
##  lrm(formula = target ~ zn + lnindus + nox + rm + age + dis + 
##      lnrad + ptratio + black + lstat + medv, data = crimetraining)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           466    LR chi2     445.35    R2       0.821    C       0.970    
##   0            237    d.f.            11    g        7.635    Dxy     0.941    
##   1            229    Pr(> chi2) <0.0001    gr    2070.326    gamma   0.942    
##  max |deriv| 4e-05                          gp       0.471    tau-a   0.471    
##                                             Brier    0.068                     
##  
##            Coef     S.E.   Wald Z Pr(>|Z|)
##  Intercept -33.6661 6.3998 -5.26  <0.0001 
##  zn         -0.0759 0.0331 -2.29  0.0218  
##  lnindus    -0.2576 0.4655 -0.55  0.5800  
##  nox        39.6745 6.8326  5.81  <0.0001 
##  rm         -0.7576 0.7104 -1.07  0.2862  
##  age         0.0346 0.0132  2.63  0.0086  
##  dis         0.8138 0.2223  3.66  0.0003  
##  lnrad       2.7215 0.6116  4.45  <0.0001 
##  ptratio     0.3439 0.1200  2.87  0.0042  
##  black      -0.0103 0.0059 -1.75  0.0798  
##  lstat       0.0281 0.0524  0.54  0.5917  
##  medv        0.2231 0.0706  3.16  0.0016  
## 
crimetraining$target1<-crimetraining$target
crimetraining$target1<-as.numeric(crimetraining$target1)
crimetraining$target1[crimetraining$target1=="0"] <- ".00001"
ols1=lm(target1 ~ zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv, data = crimetraining)
boxcox(ols1)

crime.step<-stepAIC(logit, direction = c("both", "backward", "forward"))
## Start:  AIC=214.02
## target ~ zn + indus + nox + rm + age + dis + rad + tax + ptratio + 
##     black + lstat + medv
## 
##           Df Deviance    AIC
## - rm       1   188.93 212.93
## - lstat    1   189.17 213.17
## - indus    1   189.62 213.62
## <none>         188.02 214.02
## - zn       1   193.38 217.38
## - black    1   193.53 217.53
## - tax      1   194.72 218.72
## - age      1   195.27 219.27
## - medv     1   197.30 221.30
## - ptratio  1   199.31 223.31
## - dis      1   199.77 223.77
## - rad      1   231.89 255.89
## - nox      1   257.55 281.55
## 
## Step:  AIC=212.93
## target ~ zn + indus + nox + age + dis + rad + tax + ptratio + 
##     black + lstat + medv
## 
##           Df Deviance    AIC
## - indus    1   190.51 212.51
## <none>         188.93 212.93
## - lstat    1   191.35 213.35
## + rm       1   188.02 214.02
## - black    1   194.24 216.24
## - zn       1   194.83 216.83
## - age      1   195.56 217.56
## - tax      1   196.11 218.11
## - ptratio  1   199.41 221.41
## - dis      1   199.80 221.80
## - medv     1   203.37 225.37
## - rad      1   232.21 254.21
## - nox      1   257.56 279.56
## 
## Step:  AIC=212.51
## target ~ zn + nox + age + dis + rad + tax + ptratio + black + 
##     lstat + medv
## 
##           Df Deviance    AIC
## <none>         190.51 212.51
## - lstat    1   192.57 212.57
## + indus    1   188.93 212.93
## + rm       1   189.62 213.62
## - black    1   195.51 215.51
## - zn       1   196.67 216.67
## - age      1   196.98 216.98
## - ptratio  1   200.51 220.51
## - dis      1   201.23 221.23
## - tax      1   202.79 222.79
## - medv     1   204.52 224.52
## - rad      1   240.50 260.50
## - nox      1   261.63 281.63
crime.step$anova
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## target ~ zn + indus + nox + rm + age + dis + rad + tax + ptratio + 
##     black + lstat + medv
## 
## Final Model:
## target ~ zn + nox + age + dis + rad + tax + ptratio + black + 
##     lstat + medv
## 
## 
##      Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1                            453   188.0190 214.0190
## 2    - rm  1 0.9131359       454   188.9321 212.9321
## 3 - indus  1 1.5819995       455   190.5141 212.5141

Model Selection

Not that we’ve analyzed the results from our model, it’s time to start building prediction, and applying them to our test data set.

Based on a threshold of .5, our model is yielding a measures of prediction (accuracy, CER, precision, sensitivity and specificity), all in the area of .9 (see output below). This indicates that our prediction is very good, and hopefully it yield similar prevalence results (incidence of response value “yes”) in the training and test data sets. Also, our AUC value is .97, an indicator of the strength of our predictive model, because it tells us how well we are separate the positive and negative response groups.

It’s worth noting that our custom functions for prediction measures, match what is produced in the confusionMatrix package.

Finally, after applying our prediction values to the test data set, we’re able to compare the prevalence number from our training (see confusionMatrix) and our manual calculation from our test data sets and they are virtually the same (.49 vs. .50).

See the results below.

knitr::opts_chunk$set(error = TRUE)
logitfinal=glm(target ~ zn+nox+rm+age+dis+lnrad+ptratio+black+medv, data = crimetraining, family = binomial(link = "logit"))

logitfinallrm=lrm(target ~ zn+nox+rm+age+dis+lnrad+ptratio+black+medv, data = crimetraining)
logitfinallrm
## Logistic Regression Model
##  
##  lrm(formula = target ~ zn + nox + rm + age + dis + lnrad + ptratio + 
##      black + medv, data = crimetraining)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           466    LR chi2     444.83    R2       0.820    C       0.970    
##   0            237    d.f.             9    g        7.511    Dxy     0.941    
##   1            229    Pr(> chi2) <0.0001    gr    1827.933    gamma   0.942    
##  max |deriv| 2e-05                          gp       0.471    tau-a   0.471    
##                                             Brier    0.068                     
##  
##            Coef     S.E.   Wald Z Pr(>|Z|)
##  Intercept -32.7537 6.2282 -5.26  <0.0001 
##  zn         -0.0712 0.0318 -2.24  0.0250  
##  nox        38.6106 6.3523  6.08  <0.0001 
##  rm         -0.9085 0.6657 -1.36  0.1723  
##  age         0.0375 0.0120  3.12  0.0018  
##  dis         0.8210 0.2229  3.68  0.0002  
##  lnrad       2.6980 0.6024  4.48  <0.0001 
##  ptratio     0.3450 0.1200  2.87  0.0041  
##  black      -0.0103 0.0059 -1.74  0.0817  
##  medv        0.2303 0.0692  3.33  0.0009  
## 
#model evaluation
logit.prob=predict(logitfinal,type = "response")
logit.pred=ifelse(logit.prob>=0.5,"1","0")
crimetraining$pred <- logit.pred
crimetraining$pred<-as.numeric(crimetraining$pred)
is.numeric(crimetraining$pred)
## [1] TRUE
crimetraining$prob <- logit.prob
crimetraining$prob <- round(crimetraining$prob, digits = 2)

crimetraining <- dplyr::mutate(crimetraining, compare = crimetraining$target + crimetraining$pred)

TP<- nrow(crimetraining[(crimetraining$compare ==2 & crimetraining$pred == 1),])
FP <- nrow(crimetraining[(crimetraining$compare ==1 & crimetraining$pred == 1),])
TN <- nrow(crimetraining[(crimetraining$compare ==0 & crimetraining$pred == 0),])
FN <- nrow(crimetraining[(crimetraining$compare ==1 & crimetraining$pred == 0),])

myAccuracy <- function() {(TP + TN)/(TP+TN+FP+FN)}

myCER <- function() {(FP + FN)/(TP+FP+TN+FN)}

myPrecision <- function() {TP/(TP + FP)}

mySensitivity <- function() {TP/(TP + FN)}

mySpecificity <- function() {TN/(TN + FP)}

Precision <- TP/(TP + FP)
Sensitivity <- TP/(TP + FN)
myF1Score <- function() {(2 * Precision * Sensitivity)/(Precision + Sensitivity)
}
myAllFunction <- function(j) {(j = c(myAccuracy(),myCER(),myPrecision(),mySensitivity(),mySpecificity(),myF1Score()))}
allfunctions <-c(myAllFunction())
names(allfunctions) = c("Accuracy", "CER", "Precision", "Sensitivity", "Specificity", "F1")
allfunctions
##    Accuracy         CER   Precision Sensitivity Specificity          F1 
##  0.90772532  0.09227468  0.91891892  0.89082969  0.92405063  0.90465632
plot(roc(crimetraining$target, crimetraining$prob, direction="<"),col="blue", lwd=3, main="Classification", print.auc=TRUE)

confusionMatrix(crimetraining$pred, crimetraining$target, positive = "1", dnn = c("Prediction", "Reference"), prevalence = NULL)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 219  25
##          1  18 204
##                                           
##                Accuracy : 0.9077          
##                  95% CI : (0.8777, 0.9324)
##     No Information Rate : 0.5086          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8153          
##  Mcnemar's Test P-Value : 0.3602          
##                                           
##             Sensitivity : 0.8908          
##             Specificity : 0.9241          
##          Pos Pred Value : 0.9189          
##          Neg Pred Value : 0.8975          
##              Prevalence : 0.4914          
##          Detection Rate : 0.4378          
##    Detection Prevalence : 0.4764          
##       Balanced Accuracy : 0.9074          
##                                           
##        'Positive' Class : 1               
## 
#add log(rad)
crimeeval$lnrad<-log(crimeeval$rad)

logit.evalprob <- round(predict(logitfinal, crimeeval, type = "response"), digits = 2)
logit.evalprob
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 0.05 0.64 0.63 0.64 0.09 0.23 0.24 0.01 0.00 0.00 0.14 0.12 0.84 0.96 0.75 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
## 0.18 0.33 0.98 0.15 0.00 0.00 0.02 0.11 0.20 0.17 0.53 0.00 1.00 1.00 1.00 
##   31   32   33   34   35   36   37   38   39   40 
## 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 0.76 0.04
logit.evalpred <- ifelse(logit.evalprob>=0.5,"1","0")
logit.evalpred
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## "0" "1" "1" "1" "0" "0" "0" "0" "0" "0" "0" "0" "1" "1" "1" "0" "0" "1" 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
## "0" "0" "0" "0" "0" "0" "0" "1" "0" "1" "1" "1" "1" "1" "1" "1" "1" "1" 
##  37  38  39  40 
## "1" "1" "1" "0"
logit.evalpred<-as.numeric(logit.evalpred)
round(sum(logit.evalpred)/length(logit.evalpred) , digits = 2)
## [1] 0.5

Source: Why Data Scientists Split Data into Train and Test

Source: AIC Multicollinearity

Source: What is a Logit Function and Why Use Logistic Regression