library(data.table)
## Warning: package 'data.table' was built under R version 3.3.2
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.2
library(pROC)
## Warning: package 'pROC' was built under R version 3.3.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
#data are read from GitHub; an internet connection is required
crimeeval <- "https://raw.githubusercontent.com/Misterresearch/CUNY-Projects/master/crime-evaluation-data.csv"
crimetraining <-"https://raw.githubusercontent.com/Misterresearch/CUNY-Projects/master/crime-training-data.csv"
crimeeval <- read.table(file = crimeeval, header = TRUE, sep = ",", na.strings = "NA")
crimetraining <- read.table(file = crimetraining, header = TRUE, sep = ",", na.strings = "NA")
DATA EXPLORATION
For this study, we've been provided with a two-way random partition of the data into evaluation and training sets. The evaluation/test data is not used to fit our predictive model; however, we will analyze the evaluation set's mean "yes" classification rate (prevalence) and compare it against the training data set once our probabilities and classifications have been created.
First, however, we check for missing values in the training data set. The checks below reveal no missing data, so we do not need to develop imputation methods going forward.
#NA check, no imputation required
sum(is.na(crimetraining$zn))
## [1] 0
sum(is.na(crimetraining$indus))
## [1] 0
sum(is.na(crimetraining$chas))
## [1] 0
sum(is.na(crimetraining$nox))
## [1] 0
sum(is.na(crimetraining$rm))
## [1] 0
sum(is.na(crimetraining$age))
## [1] 0
sum(is.na(crimetraining$dis))
## [1] 0
sum(is.na(crimetraining$rad))
## [1] 0
sum(is.na(crimetraining$tax))
## [1] 0
sum(is.na(crimetraining$ptratio))
## [1] 0
sum(is.na(crimetraining$black))
## [1] 0
sum(is.na(crimetraining$lstat))
## [1] 0
sum(is.na(crimetraining$medv))
## [1] 0
sum(is.na(crimetraining$target))
## [1] 0
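For reference, the per-column checks above can be collapsed into a single line; a compact equivalent (output omitted here) would be:
#one-line NA count for every column at once
colSums(is.na(crimetraining))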
#Summary Stats
summary(crimetraining)
## zn indus chas nox
## Min. : 0.00 Min. : 0.460 Min. :0.00000 Min. :0.3890
## 1st Qu.: 0.00 1st Qu.: 5.145 1st Qu.:0.00000 1st Qu.:0.4480
## Median : 0.00 Median : 9.690 Median :0.00000 Median :0.5380
## Mean : 11.58 Mean :11.105 Mean :0.07082 Mean :0.5543
## 3rd Qu.: 16.25 3rd Qu.:18.100 3rd Qu.:0.00000 3rd Qu.:0.6240
## Max. :100.00 Max. :27.740 Max. :1.00000 Max. :0.8710
## rm age dis rad
## Min. :3.863 Min. : 2.90 Min. : 1.130 Min. : 1.00
## 1st Qu.:5.887 1st Qu.: 43.88 1st Qu.: 2.101 1st Qu.: 4.00
## Median :6.210 Median : 77.15 Median : 3.191 Median : 5.00
## Mean :6.291 Mean : 68.37 Mean : 3.796 Mean : 9.53
## 3rd Qu.:6.630 3rd Qu.: 94.10 3rd Qu.: 5.215 3rd Qu.:24.00
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.00
## tax ptratio black lstat
## Min. :187.0 Min. :12.6 Min. : 0.32 Min. : 1.730
## 1st Qu.:281.0 1st Qu.:16.9 1st Qu.:375.61 1st Qu.: 7.043
## Median :334.5 Median :18.9 Median :391.34 Median :11.350
## Mean :409.5 Mean :18.4 Mean :357.12 Mean :12.631
## 3rd Qu.:666.0 3rd Qu.:20.2 3rd Qu.:396.24 3rd Qu.:16.930
## Max. :711.0 Max. :22.0 Max. :396.90 Max. :37.970
## medv target
## Min. : 5.00 Min. :0.0000
## 1st Qu.:17.02 1st Qu.:0.0000
## Median :21.20 Median :0.0000
## Mean :22.59 Mean :0.4914
## 3rd Qu.:25.00 3rd Qu.:1.0000
## Max. :50.00 Max. :1.0000
A quick inspection of the descriptive summaries, specifically the differences between the means and medians, offers a clue about which variables might not be normally distributed: zn, chas (binary), age, rad, tax and black.
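To make that mean-median comparison concrete, the gaps can be computed directly; a quick base-R sketch (output omitted):
#mean minus median for each column; large gaps suggest skew
sapply(crimetraining, function(x) round(mean(x) - median(x), 2))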
Below are histograms that confirm our suspicions in some cases.
par(mar=c(1,1,1,1))
par(mfrow=c(5,3))
hist(crimetraining$zn, cex.main=.7)
hist(crimetraining$indus,cex.main=.7 )
hist(crimetraining$chas, cex.main=.7)
hist(crimetraining$nox, cex.main=.7)
hist(crimetraining$rm, cex.main=.7)
hist(crimetraining$age, cex.main=.7)
hist(crimetraining$dis,cex.main=.7)
hist(crimetraining$rad,cex.main=.7)
hist(crimetraining$tax,cex.main=.7)
hist(crimetraining$ptratio,cex.main=.7)
hist(crimetraining$black,cex.main=.7)
hist(crimetraining$medv,cex.main=.7)
hist(crimetraining$lstat,cex.main=.7)
Based on the correlation matrix below, it's probably a good idea to drop "chas" from the model. Its correlation coefficient with the target is very low (.08), even allowing for the fact that both it and the dependent variable are dichotomous.
Also, rad and tax have a .91 correlation, so it seems that one of them should be dropped from our model. We'll explain in the transformation section which one we decided to keep and why.
#Correlation Matrix
crimetrainingmatrix<- cor(crimetraining)
round(crimetrainingmatrix,2)
## zn indus chas nox rm age dis rad tax ptratio
## zn 1.00 -0.54 -0.04 -0.52 0.32 -0.57 0.66 -0.32 -0.32 -0.39
## indus -0.54 1.00 0.06 0.76 -0.39 0.64 -0.70 0.60 0.73 0.39
## chas -0.04 0.06 1.00 0.10 0.09 0.08 -0.10 -0.02 -0.05 -0.13
## nox -0.52 0.76 0.10 1.00 -0.30 0.74 -0.77 0.60 0.65 0.18
## rm 0.32 -0.39 0.09 -0.30 1.00 -0.23 0.20 -0.21 -0.30 -0.36
## age -0.57 0.64 0.08 0.74 -0.23 1.00 -0.75 0.46 0.51 0.26
## dis 0.66 -0.70 -0.10 -0.77 0.20 -0.75 1.00 -0.49 -0.53 -0.23
## rad -0.32 0.60 -0.02 0.60 -0.21 0.46 -0.49 1.00 0.91 0.47
## tax -0.32 0.73 -0.05 0.65 -0.30 0.51 -0.53 0.91 1.00 0.47
## ptratio -0.39 0.39 -0.13 0.18 -0.36 0.26 -0.23 0.47 0.47 1.00
## black 0.18 -0.36 0.04 -0.38 0.13 -0.27 0.29 -0.45 -0.44 -0.18
## lstat -0.43 0.61 -0.05 0.60 -0.63 0.61 -0.51 0.50 0.56 0.38
## medv 0.38 -0.50 0.16 -0.43 0.71 -0.38 0.26 -0.40 -0.49 -0.52
## target -0.43 0.60 0.08 0.73 -0.15 0.63 -0.62 0.63 0.61 0.25
## black lstat medv target
## zn 0.18 -0.43 0.38 -0.43
## indus -0.36 0.61 -0.50 0.60
## chas 0.04 -0.05 0.16 0.08
## nox -0.38 0.60 -0.43 0.73
## rm 0.13 -0.63 0.71 -0.15
## age -0.27 0.61 -0.38 0.63
## dis 0.29 -0.51 0.26 -0.62
## rad -0.45 0.50 -0.40 0.63
## tax -0.44 0.56 -0.49 0.61
## ptratio -0.18 0.38 -0.52 0.25
## black 1.00 -0.35 0.33 -0.35
## lstat -0.35 1.00 -0.74 0.47
## medv 0.33 -0.74 1.00 -0.27
## target -0.35 0.47 -0.27 1.00
#Correlation to Target variable
corcrimetrainingtb <- data.table(crimetrainingmatrix, keep.rownames = TRUE)
corcrimetrainingtb[,c(1,15)]
## rn target
## 1: zn -0.43168176
## 2: indus 0.60485074
## 3: chas 0.08004187
## 4: nox 0.72610622
## 5: rm -0.15255334
## 6: age 0.63010625
## 7: dis -0.61867312
## 8: rad 0.62810492
## 9: tax 0.61111331
## 10: ptratio 0.25084892
## 11: black -0.35295680
## 12: lstat 0.46912702
## 13: medv -0.27055071
## 14: target 1.00000000
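As a programmatic cross-check on the rad/tax pair flagged above, caret's findCorrelation() scans a correlation matrix for predictors above a cutoff; a sketch, using a 0.90 cutoff of our choosing (note the matrix here still includes the target column):
#flag variables with pairwise correlation above .90
highcor <- findCorrelation(crimetrainingmatrix, cutoff = 0.90)
colnames(crimetrainingmatrix)[highcor]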
pairs(~zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv, data=crimetraining, main = "Crime Training All")
DATA PREPARATION
Based on our initial loading of the data, there are no missing values, eliminating the need for imputation.
The only dichotomous independent variable in the model, chas, was removed because of its low correlation with the dependent variable (target). Further, as we'll see with our AIC results, the optimal model drops this variable as well, confirming our judgment here. The primary mathematical modeling done here transforms skewed or bimodal variables toward a normal distribution using log transformations, and then applies a Box-Cox transformation to the full model.
Although chas and tax have bimodal and right-skewed qualities, chas has already been removed from our model (and tax will be dropped later in favor of rad). The remaining variables that need transformation are black, rad, tax and indus. Of those that underwent a natural log transformation, indus and rad became normally distributed.
Because the Box-Cox transformation was performed on an entire model, we'll save those results for the model building section.
#natural log transformation
crimetraining$lnblack<-log(crimetraining$black)
crimetraining$lnrad<-log(crimetraining$rad)
crimetraining$lntax<-log(crimetraining$tax)
crimetraining$lnindus<-log(crimetraining$indus)
par(mar=c(1,1,1,1))
par(mfrow=c(2,2))
#retain bimodal qualities
hist(crimetraining$lnblack)
hist(crimetraining$lntax)
#become normally distributed
hist(crimetraining$lnindus)
hist(crimetraining$lnrad)
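As a numeric companion to the histograms, a rough moment-based skewness helper (written here in base R, not taken from any package) shows how the log transformation pulls indus and rad toward symmetry; output omitted:
#simple skewness estimate: third central moment over sd cubed
skew <- function(x) mean((x - mean(x))^3)/sd(x)^3
c(indus = skew(crimetraining$indus), lnindus = skew(crimetraining$lnindus),
  rad = skew(crimetraining$rad), lnrad = skew(crimetraining$lnrad))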
MODEL BUILDING
Given the transformations above, it's time to determine the best model for our data. We'll use backward and forward stepwise selection via the stepAIC() function in the MASS package, as well as a more manual approach, always with an eye on multicollinearity.
Our first logit model starts with all variables except "chas" (eliminated earlier), without any transformations. We'll look at its R^2 and compare it to a second logit model (logitA) with transformed rad (lnrad) and indus (lnindus) variables. We'll also attempt the Box-Cox transformation on the original model, with a change to the dependent variable only.
The original logit model (logit) yields a Nagelkerke pseudo-R^2 of .83 (the R2 index reported by lrm); however, rad and tax carry opposite coefficient signs, and we know these two variables have an almost perfect linear relationship (r = .91), indicating possible multicollinearity.
The second logit model (logitA) replaces indus and rad with their natural-log forms and drops tax. This model yields an R^2 of .82, with statistical significance for zn, nox, age, dis, lnrad, ptratio and medv, all with the expected coefficient signs.
The only transformation of the dependent variable that ensured positive, non-zero values for the Box-Cox procedure was to add a small constant, replacing zeros with .00001 and transforming "target" into "target1". The Box-Cox procedure yields a lambda of zero, which corresponds to a log transformation and is not meaningful for a 0/1 response; it therefore wasn't useful in creating a new regression model, and any further attempt to use Box-Cox was abandoned.
Finally, stepAIC() was applied to our original logit model, which contained all variables prior to any log transformations.
The model that yielded the lowest AIC (212.51) included the following variables: zn + nox + age + dis + rad + tax + ptratio + black + lstat + medv (see the stepwise trace below).
Because stepAIC() was run on the untransformed variables, we make two adjustments in our final model: rad is replaced with lnrad because of its more desirable distribution, and tax, which is nearly collinear with rad (r = .91), is dropped in lnrad's favor. The final model we fit is target ~ zn + nox + rm + age + dis + lnrad + ptratio + black + medv.
The final analysis of our model tells us that while socio-economic factors such as the number of rooms in a dwelling (rm), race (black) and lower social status (lstat) have an intuitive appeal in predicting the incidence of crime, almost none of these socio-economic variables were statistically significant. The variables that were significant, such as the proportion of an area zoned for large lots (zn), the amount of manufacturing activity as represented by nitrogen oxide concentration (nox), the age of the buildings (age) and accessibility to radial highways (lnrad), all point to location, perhaps in isolated urban industrial areas, as the kind of predictor with a statistically significant relationship to crime outcomes. While most of the socio-economic factors in our model were not statistically significant, two variables that might be considered socio-economic did show significance: distance to employment centers (dis) and the pupil/teacher ratio (ptratio). The dis variable may be as much an indicator of deserted areas and location as of socio-economic status, and ptratio, while almost certainly a socio-economic indicator, is also an indicator of population density and of whether an area is urban or suburban.
The final model tells us that geographic location is more important than socio-economic variables, especially demographic ones such as race and status. However, more bivariate analysis is required to understand the relationship between variables such as nitrogen oxide concentration (the most impactful of our variables) and the other independent variables, to determine whether nox is a proxy for socio-economic factors not captured in our model.
#install.packages("rms")
library(rms)
## Warning: package 'rms' was built under R version 3.3.2
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.3.2
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
#ols=lm(target ~ zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv, data = crimetraining)
#summary(ols)
#contains coefficients and AIC
logit=glm(target ~ zn+indus+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv, data = crimetraining, family = binomial(link = "logit"))
summary(logit)
##
## Call:
## glm(formula = target ~ zn + indus + nox + rm + age + dis + rad +
## tax + ptratio + black + lstat + medv, family = binomial(link = "logit"),
## data = crimetraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2099 -0.1356 -0.0013 0.0018 3.4691
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -35.393661 6.866186 -5.155 2.54e-07 ***
## zn -0.069129 0.034604 -1.998 0.04575 *
## indus -0.057736 0.046446 -1.243 0.21384
## nox 48.003143 7.781466 6.169 6.88e-10 ***
## rm -0.703120 0.738498 -0.952 0.34105
## age 0.035141 0.013766 2.553 0.01069 *
## dis 0.738029 0.231466 3.189 0.00143 **
## rad 0.720422 0.163465 4.407 1.05e-05 ***
## tax -0.007419 0.003002 -2.472 0.01345 *
## ptratio 0.409562 0.128375 3.190 0.00142 **
## black -0.012478 0.006486 -1.924 0.05438 .
## lstat 0.057059 0.053145 1.074 0.28298
## medv 0.201062 0.070966 2.833 0.00461 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 188.02 on 453 degrees of freedom
## AIC: 214.02
##
## Number of Fisher Scoring iterations: 9
#contains coefficients and R2
logitlrm<-lrm(target ~ zn+indus+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv, data = crimetraining)
logitlrm
## Logistic Regression Model
##
## lrm(formula = target ~ zn + indus + nox + rm + age + dis + rad +
## tax + ptratio + black + lstat + medv, data = crimetraining)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 466 LR chi2 457.86 R2 0.834 C 0.975
## 0 237 d.f. 12 g 11.225 Dxy 0.949
## 1 229 Pr(> chi2) <0.0001 gr 74998.421 gamma 0.949
## max |deriv| 8e-07 gp 0.475 tau-a 0.475
## Brier 0.062
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -35.3937 6.8662 -5.15 <0.0001
## zn -0.0691 0.0346 -2.00 0.0457
## indus -0.0577 0.0464 -1.24 0.2138
## nox 48.0031 7.7815 6.17 <0.0001
## rm -0.7031 0.7385 -0.95 0.3410
## age 0.0351 0.0138 2.55 0.0107
## dis 0.7380 0.2315 3.19 0.0014
## rad 0.7204 0.1635 4.41 <0.0001
## tax -0.0074 0.0030 -2.47 0.0134
## ptratio 0.4096 0.1284 3.19 0.0014
## black -0.0125 0.0065 -1.92 0.0544
## lstat 0.0571 0.0531 1.07 0.2830
## medv 0.2011 0.0710 2.83 0.0046
##
logitA=glm(target ~ zn+lnindus+nox+rm+age+dis+lnrad+ptratio+black+lstat+medv, data = crimetraining, family = binomial(link = "logit"))
summary(logitA)
##
## Call:
## glm(formula = target ~ zn + lnindus + nox + rm + age + dis +
## lnrad + ptratio + black + lstat + medv, family = binomial(link = "logit"),
## data = crimetraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0672 -0.1793 -0.0014 0.0521 3.5217
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -33.666084 6.399740 -5.261 1.44e-07 ***
## zn -0.075896 0.033078 -2.294 0.021764 *
## lnindus -0.257639 0.465543 -0.553 0.579978
## nox 39.674508 6.832508 5.807 6.37e-09 ***
## rm -0.757564 0.710380 -1.066 0.286233
## age 0.034624 0.013179 2.627 0.008607 **
## dis 0.813769 0.222254 3.661 0.000251 ***
## lnrad 2.721452 0.611634 4.449 8.61e-06 ***
## ptratio 0.343898 0.119984 2.866 0.004154 **
## black -0.010315 0.005888 -1.752 0.079790 .
## lstat 0.028100 0.052392 0.536 0.591727
## medv 0.223111 0.070588 3.161 0.001574 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 200.53 on 454 degrees of freedom
## AIC: 224.53
##
## Number of Fisher Scoring iterations: 8
logitAlrm=lrm(target ~ zn+lnindus+nox+rm+age+dis+lnrad+ptratio+black+lstat+medv, data = crimetraining)
logitAlrm
## Logistic Regression Model
##
## lrm(formula = target ~ zn + lnindus + nox + rm + age + dis +
## lnrad + ptratio + black + lstat + medv, data = crimetraining)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 466 LR chi2 445.35 R2 0.821 C 0.970
## 0 237 d.f. 11 g 7.635 Dxy 0.941
## 1 229 Pr(> chi2) <0.0001 gr 2070.326 gamma 0.942
## max |deriv| 4e-05 gp 0.471 tau-a 0.471
## Brier 0.068
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -33.6661 6.3998 -5.26 <0.0001
## zn -0.0759 0.0331 -2.29 0.0218
## lnindus -0.2576 0.4655 -0.55 0.5800
## nox 39.6745 6.8326 5.81 <0.0001
## rm -0.7576 0.7104 -1.07 0.2862
## age 0.0346 0.0132 2.63 0.0086
## dis 0.8138 0.2223 3.66 0.0003
## lnrad 2.7215 0.6116 4.45 <0.0001
## ptratio 0.3439 0.1200 2.87 0.0042
## black -0.0103 0.0059 -1.75 0.0798
## lstat 0.0281 0.0524 0.54 0.5917
## medv 0.2231 0.0706 3.16 0.0016
##
#replace zeros with a small positive constant so Box-Cox can run
crimetraining$target1 <- as.numeric(crimetraining$target)
crimetraining$target1[crimetraining$target1 == 0] <- 0.00001
ols1=lm(target1 ~ zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv, data = crimetraining)
boxcox(ols1)
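Rather than reading lambda off the profile-likelihood plot, it can be extracted from the list that MASS::boxcox() returns; a minimal sketch:
#lambda maximizing the profile log-likelihood (expected to be near 0 here)
bc <- boxcox(ols1, plotit = FALSE)
bc$x[which.max(bc$y)]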
#stepwise selection in both directions (direction takes a single value)
crime.step <- stepAIC(logit, direction = "both")
## Start: AIC=214.02
## target ~ zn + indus + nox + rm + age + dis + rad + tax + ptratio +
## black + lstat + medv
##
## Df Deviance AIC
## - rm 1 188.93 212.93
## - lstat 1 189.17 213.17
## - indus 1 189.62 213.62
## <none> 188.02 214.02
## - zn 1 193.38 217.38
## - black 1 193.53 217.53
## - tax 1 194.72 218.72
## - age 1 195.27 219.27
## - medv 1 197.30 221.30
## - ptratio 1 199.31 223.31
## - dis 1 199.77 223.77
## - rad 1 231.89 255.89
## - nox 1 257.55 281.55
##
## Step: AIC=212.93
## target ~ zn + indus + nox + age + dis + rad + tax + ptratio +
## black + lstat + medv
##
## Df Deviance AIC
## - indus 1 190.51 212.51
## <none> 188.93 212.93
## - lstat 1 191.35 213.35
## + rm 1 188.02 214.02
## - black 1 194.24 216.24
## - zn 1 194.83 216.83
## - age 1 195.56 217.56
## - tax 1 196.11 218.11
## - ptratio 1 199.41 221.41
## - dis 1 199.80 221.80
## - medv 1 203.37 225.37
## - rad 1 232.21 254.21
## - nox 1 257.56 279.56
##
## Step: AIC=212.51
## target ~ zn + nox + age + dis + rad + tax + ptratio + black +
## lstat + medv
##
## Df Deviance AIC
## <none> 190.51 212.51
## - lstat 1 192.57 212.57
## + indus 1 188.93 212.93
## + rm 1 189.62 213.62
## - black 1 195.51 215.51
## - zn 1 196.67 216.67
## - age 1 196.98 216.98
## - ptratio 1 200.51 220.51
## - dis 1 201.23 221.23
## - tax 1 202.79 222.79
## - medv 1 204.52 224.52
## - rad 1 240.50 260.50
## - nox 1 261.63 281.63
crime.step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## target ~ zn + indus + nox + rm + age + dis + rad + tax + ptratio +
## black + lstat + medv
##
## Final Model:
## target ~ zn + nox + age + dis + rad + tax + ptratio + black +
## lstat + medv
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 453 188.0190 214.0190
## 2 - rm 1 0.9131359 454 188.9321 212.9321
## 3 - indus 1 1.5819995 455 190.5141 212.5141
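For a quick side-by-side check of the two hand-built candidates, stats::AIC() accepts several fitted models at once; a sketch (output omitted):
#compare AIC across candidate models
AIC(logit, logitA)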
MODEL SELECTION
Now that we've analyzed the results from our models, it's time to generate predictions and apply them to the evaluation data set.
Based on a threshold of .5, our model yields prediction measures (accuracy, classification error rate, precision, sensitivity and specificity) all in the neighborhood of .9 (see output below). This indicates that our predictions are very good, and hopefully the model will yield similar prevalence (the incidence of the response value "yes") in the training and evaluation data sets. Also, our AUC value is .97, an indicator of the strength of our predictive model, because it tells us how well we separate the positive and negative response groups.
It's worth noting that our custom functions for the prediction measures match what is produced by caret's confusionMatrix() function.
Finally, after applying our final model to the evaluation data set, we can compare the prevalence from the training data (see the confusionMatrix output) with our manual calculation on the evaluation data; they are virtually the same (.49 vs. .50). See the results below.
knitr::opts_chunk$set(error = TRUE)
logitfinal=glm(target ~ zn+nox+rm+age+dis+lnrad+ptratio+black+medv, data = crimetraining, family = binomial(link = "logit"))
logitfinallrm=lrm(target ~ zn+nox+rm+age+dis+lnrad+ptratio+black+medv, data = crimetraining)
logitfinallrm
## Logistic Regression Model
##
## lrm(formula = target ~ zn + nox + rm + age + dis + lnrad + ptratio +
## black + medv, data = crimetraining)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 466 LR chi2 444.83 R2 0.820 C 0.970
## 0 237 d.f. 9 g 7.511 Dxy 0.941
## 1 229 Pr(> chi2) <0.0001 gr 1827.933 gamma 0.942
## max |deriv| 2e-05 gp 0.471 tau-a 0.471
## Brier 0.068
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -32.7537 6.2282 -5.26 <0.0001
## zn -0.0712 0.0318 -2.24 0.0250
## nox 38.6106 6.3523 6.08 <0.0001
## rm -0.9085 0.6657 -1.36 0.1723
## age 0.0375 0.0120 3.12 0.0018
## dis 0.8210 0.2229 3.68 0.0002
## lnrad 2.6980 0.6024 4.48 <0.0001
## ptratio 0.3450 0.1200 2.87 0.0041
## black -0.0103 0.0059 -1.74 0.0817
## medv 0.2303 0.0692 3.33 0.0009
##
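To support the interpretation discussed earlier, the coefficients can be converted to odds ratios; a sketch (confint() on a glm computes profile-likelihood intervals; note that nox lives on a roughly 0-1 scale, so its per-unit odds ratio looks extreme):
#odds ratios with 95% confidence intervals for the final model
round(exp(cbind(OR = coef(logitfinal), confint(logitfinal))), 3)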
#model evaluation
logit.prob=predict(logitfinal,type = "response")
logit.pred=ifelse(logit.prob>=0.5,"1","0")
crimetraining$pred <- logit.pred
crimetraining$pred<-as.numeric(crimetraining$pred)
is.numeric(crimetraining$pred)
## [1] TRUE
crimetraining$prob <- logit.prob
crimetraining$prob <- round(crimetraining$prob, digits = 2)
#compare encodes agreement: target + pred = 2 (both 1), 0 (both 0), 1 (mismatch)
crimetraining <- dplyr::mutate(crimetraining, compare = target + pred)
TP <- nrow(crimetraining[(crimetraining$compare == 2 & crimetraining$pred == 1),]) #true positives
FP <- nrow(crimetraining[(crimetraining$compare == 1 & crimetraining$pred == 1),]) #false positives
TN <- nrow(crimetraining[(crimetraining$compare == 0 & crimetraining$pred == 0),]) #true negatives
FN <- nrow(crimetraining[(crimetraining$compare == 1 & crimetraining$pred == 0),]) #false negatives
#metric helpers; these read TP, FP, TN and FN from the global environment
myAccuracy <- function() {(TP + TN)/(TP+TN+FP+FN)}
myCER <- function() {(FP + FN)/(TP+FP+TN+FN)}
myPrecision <- function() {TP/(TP + FP)}
mySensitivity <- function() {TP/(TP + FN)}
mySpecificity <- function() {TN/(TN + FP)}
Precision <- TP/(TP + FP)
Sensitivity <- TP/(TP + FN)
myF1Score <- function() {(2 * Precision * Sensitivity)/(Precision + Sensitivity)}
myAllFunction <- function() {c(myAccuracy(),myCER(),myPrecision(),mySensitivity(),mySpecificity(),myF1Score())}
allfunctions <-c(myAllFunction())
names(allfunctions) = c("Accuracy", "CER", "Precision", "Sensitivity", "Specificity", "F1")
allfunctions
## Accuracy CER Precision Sensitivity Specificity F1
## 0.90772532 0.09227468 0.91891892 0.89082969 0.92405063 0.90465632
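As a one-line sanity check on the accuracy figure above (it should reproduce 0.9077):
#proportion of predictions matching the observed target
mean(crimetraining$pred == crimetraining$target)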
plot(roc(crimetraining$target, crimetraining$prob, direction="<"),col="blue", lwd=3, main="Classification", print.auc=TRUE)
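The AUC printed on the plot can also be pulled out numerically via pROC's auc(); a sketch:
#numeric AUC for the training-set probabilities
auc(roc(crimetraining$target, crimetraining$prob, direction = "<"))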
confusionMatrix(crimetraining$pred, crimetraining$target, positive = "1", dnn = c("Prediction", "Reference"), prevalence = NULL)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 219 25
## 1 18 204
##
## Accuracy : 0.9077
## 95% CI : (0.8777, 0.9324)
## No Information Rate : 0.5086
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8153
## Mcnemar's Test P-Value : 0.3602
##
## Sensitivity : 0.8908
## Specificity : 0.9241
## Pos Pred Value : 0.9189
## Neg Pred Value : 0.8975
## Prevalence : 0.4914
## Detection Rate : 0.4378
## Detection Prevalence : 0.4764
## Balanced Accuracy : 0.9074
##
## 'Positive' Class : 1
##
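The .5 cutoff used above is simply the conventional default; pROC's coords() can suggest the threshold that maximizes Youden's J (sensitivity + specificity - 1), a sketch (output omitted):
#threshold maximizing sensitivity + specificity
coords(roc(crimetraining$target, crimetraining$prob, direction = "<"), "best")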
#add log(rad) to the evaluation set, matching the training transformation
crimeeval$lnrad<-log(crimeeval$rad)
logit.evalprob <- round(predict(logitfinal, crimeeval, type = "response"), digits = 2)
logit.evalprob
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 0.05 0.64 0.63 0.64 0.09 0.23 0.24 0.01 0.00 0.00 0.14 0.12 0.84 0.96 0.75
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 0.18 0.33 0.98 0.15 0.00 0.00 0.02 0.11 0.20 0.17 0.53 0.00 1.00 1.00 1.00
## 31 32 33 34 35 36 37 38 39 40
## 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 0.76 0.04
logit.evalpred <- ifelse(logit.evalprob>=0.5,"1","0")
logit.evalpred
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## "0" "1" "1" "1" "0" "0" "0" "0" "0" "0" "0" "0" "1" "1" "1" "0" "0" "1"
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## "0" "0" "0" "0" "0" "0" "0" "1" "0" "1" "1" "1" "1" "1" "1" "1" "1" "1"
## 37 38 39 40
## "1" "1" "1" "0"
logit.evalpred<-as.numeric(logit.evalpred)
round(sum(logit.evalpred)/length(logit.evalpred) , digits = 2)
## [1] 0.5
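Putting the two prevalence figures side by side, a sketch (training uses the observed targets, evaluation uses our predicted classes):
#training vs. evaluation prevalence of the "yes" class
c(train = round(mean(crimetraining$target), 2), eval = round(mean(logit.evalpred), 2))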
Source: Why Data Scientists Split Data into Train and Test
Source: AIC Multicollinearity
Source: What is a Logit Function and Why Use Logistic Regression