Abstract

This assignment focused on census type data for the City of Boston. The dataset contains 466 records that encompass the entire area. The variables for the data are housing type variables such as number of rooms but the records do not give an indication of the part of the city it is representing. The purpose for this assignment is to analyze the data, perform any data manipulation / clean-up and build three (3) binary logistic regression models using only the data (or derivatives thereof) to predict if the region is above or below the median crime rate. The chosen model provided an AIC = 251.59.

Keywords: crime, data621

Data Exploration

knitr::opts_chunk$set(echo = TRUE)
library(e1071)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
library(tidyr)
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 3.4.4
library(VIF)
library(knitr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.4.4
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.4.4
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:e1071':
## 
##     impute
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# read data
train = read.csv(file="data/crime-training-data.csv")
train2<-train
dim(train)
## [1] 466  14
#check data
summary(train) %>% kable() %>% kable_styling()
   zn </th>
 indus </th>
  chas </th>
  nox </th>
   rm </th>
  age </th>
  dis </th>
  rad </th>
  tax </th>
ptratio </th>
 black </th>
 lstat </th>
  medv </th>
 target </th>
Min. : 0.00 Min. : 0.460 Min. :0.00000 Min. :0.3890 Min. :3.863 Min. : 2.90 Min. : 1.130 Min. : 1.00 Min. :187.0 Min. :12.6 Min. : 0.32 Min. : 1.730 Min. : 5.00 Min. :0.0000
1st Qu.: 0.00 1st Qu.: 5.145 1st Qu.:0.00000 1st Qu.:0.4480 1st Qu.:5.887 1st Qu.: 43.88 1st Qu.: 2.101 1st Qu.: 4.00 1st Qu.:281.0 1st Qu.:16.9 1st Qu.:375.61 1st Qu.: 7.043 1st Qu.:17.02 1st Qu.:0.0000
Median : 0.00 Median : 9.690 Median :0.00000 Median :0.5380 Median :6.210 Median : 77.15 Median : 3.191 Median : 5.00 Median :334.5 Median :18.9 Median :391.34 Median :11.350 Median :21.20 Median :0.0000
Mean : 11.58 Mean :11.105 Mean :0.07082 Mean :0.5543 Mean :6.291 Mean : 68.37 Mean : 3.796 Mean : 9.53 Mean :409.5 Mean :18.4 Mean :357.12 Mean :12.631 Mean :22.59 Mean :0.4914
3rd Qu.: 16.25 3rd Qu.:18.100 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.630 3rd Qu.: 94.10 3rd Qu.: 5.215 3rd Qu.:24.00 3rd Qu.:666.0 3rd Qu.:20.2 3rd Qu.:396.24 3rd Qu.:16.930 3rd Qu.:25.00 3rd Qu.:1.0000
Max. :100.00 Max. :27.740 Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.00 Max. :711.0 Max. :22.0 Max. :396.90 Max. :37.970 Max. :50.00 Max. :1.0000
sapply(train, function(x) sum(is.na(x)))
##      zn   indus    chas     nox      rm     age     dis     rad     tax 
##       0       0       0       0       0       0       0       0       0 
## ptratio   black   lstat    medv  target 
##       0       0       0       0       0
ntrain<-select_if(train, is.numeric)
ntrain %>%
  keep(is.numeric) %>%                     # Keep only numeric columns
  gather() %>%                             # Convert to key-value pairs
  ggplot(aes(value)) +                     # Plot the values
    facet_wrap(~ key, scales = "free") +   # In separate panels
    geom_density()  

rcorr(as.matrix(train))
##            zn indus  chas   nox    rm   age   dis   rad   tax ptratio
## zn       1.00 -0.54 -0.04 -0.52  0.32 -0.57  0.66 -0.32 -0.32   -0.39
## indus   -0.54  1.00  0.06  0.76 -0.39  0.64 -0.70  0.60  0.73    0.39
## chas    -0.04  0.06  1.00  0.10  0.09  0.08 -0.10 -0.02 -0.05   -0.13
## nox     -0.52  0.76  0.10  1.00 -0.30  0.74 -0.77  0.60  0.65    0.18
## rm       0.32 -0.39  0.09 -0.30  1.00 -0.23  0.20 -0.21 -0.30   -0.36
## age     -0.57  0.64  0.08  0.74 -0.23  1.00 -0.75  0.46  0.51    0.26
## dis      0.66 -0.70 -0.10 -0.77  0.20 -0.75  1.00 -0.49 -0.53   -0.23
## rad     -0.32  0.60 -0.02  0.60 -0.21  0.46 -0.49  1.00  0.91    0.47
## tax     -0.32  0.73 -0.05  0.65 -0.30  0.51 -0.53  0.91  1.00    0.47
## ptratio -0.39  0.39 -0.13  0.18 -0.36  0.26 -0.23  0.47  0.47    1.00
## black    0.18 -0.36  0.04 -0.38  0.13 -0.27  0.29 -0.45 -0.44   -0.18
## lstat   -0.43  0.61 -0.05  0.60 -0.63  0.61 -0.51  0.50  0.56    0.38
## medv     0.38 -0.50  0.16 -0.43  0.71 -0.38  0.26 -0.40 -0.49   -0.52
## target  -0.43  0.60  0.08  0.73 -0.15  0.63 -0.62  0.63  0.61    0.25
##         black lstat  medv target
## zn       0.18 -0.43  0.38  -0.43
## indus   -0.36  0.61 -0.50   0.60
## chas     0.04 -0.05  0.16   0.08
## nox     -0.38  0.60 -0.43   0.73
## rm       0.13 -0.63  0.71  -0.15
## age     -0.27  0.61 -0.38   0.63
## dis      0.29 -0.51  0.26  -0.62
## rad     -0.45  0.50 -0.40   0.63
## tax     -0.44  0.56 -0.49   0.61
## ptratio -0.18  0.38 -0.52   0.25
## black    1.00 -0.35  0.33  -0.35
## lstat   -0.35  1.00 -0.74   0.47
## medv     0.33 -0.74  1.00  -0.27
## target  -0.35  0.47 -0.27   1.00
## 
## n= 466 
## 
## 
## P
##         zn     indus  chas   nox    rm     age    dis    rad    tax   
## zn             0.0000 0.3870 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## indus   0.0000        0.1874 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## chas    0.3870 0.1874        0.0355 0.0509 0.0890 0.0372 0.7321 0.3138
## nox     0.0000 0.0000 0.0355        0.0000 0.0000 0.0000 0.0000 0.0000
## rm      0.0000 0.0000 0.0509 0.0000        0.0000 0.0000 0.0000 0.0000
## age     0.0000 0.0000 0.0890 0.0000 0.0000        0.0000 0.0000 0.0000
## dis     0.0000 0.0000 0.0372 0.0000 0.0000 0.0000        0.0000 0.0000
## rad     0.0000 0.0000 0.7321 0.0000 0.0000 0.0000 0.0000        0.0000
## tax     0.0000 0.0000 0.3138 0.0000 0.0000 0.0000 0.0000 0.0000       
## ptratio 0.0000 0.0000 0.0054 0.0001 0.0000 0.0000 0.0000 0.0000 0.0000
## black   0.0000 0.0000 0.3384 0.0000 0.0041 0.0000 0.0000 0.0000 0.0000
## lstat   0.0000 0.0000 0.2679 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## medv    0.0000 0.0000 0.0005 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## target  0.0000 0.0000 0.0843 0.0000 0.0010 0.0000 0.0000 0.0000 0.0000
##         ptratio black  lstat  medv   target
## zn      0.0000  0.0000 0.0000 0.0000 0.0000
## indus   0.0000  0.0000 0.0000 0.0000 0.0000
## chas    0.0054  0.3384 0.2679 0.0005 0.0843
## nox     0.0001  0.0000 0.0000 0.0000 0.0000
## rm      0.0000  0.0041 0.0000 0.0000 0.0010
## age     0.0000  0.0000 0.0000 0.0000 0.0000
## dis     0.0000  0.0000 0.0000 0.0000 0.0000
## rad     0.0000  0.0000 0.0000 0.0000 0.0000
## tax     0.0000  0.0000 0.0000 0.0000 0.0000
## ptratio         0.0000 0.0000 0.0000 0.0000
## black   0.0000         0.0000 0.0000 0.0000
## lstat   0.0000  0.0000        0.0000 0.0000
## medv    0.0000  0.0000 0.0000        0.0000
## target  0.0000  0.0000 0.0000 0.0000
corrplot(cor(train), method="square")

# correlation test 1
cor.test(train$rad,train$tax,method="pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  train$rad and train$tax
## t = 46.239, df = 464, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8888115 0.9214292
## sample estimates:
##       cor 
## 0.9064632
#significant ignore

Data Preparation

# transform data using log for skewed

train$zn <- log(train$zn+1)
train$black <- log(train$black+1)
train$chas <- log(train$chas+1)


#remove rad per correlation in prior section

train <- train[, !(colnames(train) %in% c("rad"))]

#create variable
train$new <- train$tax / (train$medv*10)

rcorr(as.matrix(train))
##            zn indus  chas   nox    rm   age   dis   tax ptratio black
## zn       1.00 -0.60 -0.04 -0.55  0.34 -0.58  0.69 -0.40   -0.46  0.17
## indus   -0.60  1.00  0.06  0.76 -0.39  0.64 -0.70  0.73    0.39 -0.28
## chas    -0.04  0.06  1.00  0.10  0.09  0.08 -0.10 -0.05   -0.13  0.05
## nox     -0.55  0.76  0.10  1.00 -0.30  0.74 -0.77  0.65    0.18 -0.29
## rm       0.34 -0.39  0.09 -0.30  1.00 -0.23  0.20 -0.30   -0.36  0.09
## age     -0.58  0.64  0.08  0.74 -0.23  1.00 -0.75  0.51    0.26 -0.21
## dis      0.69 -0.70 -0.10 -0.77  0.20 -0.75  1.00 -0.53   -0.23  0.23
## tax     -0.40  0.73 -0.05  0.65 -0.30  0.51 -0.53  1.00    0.47 -0.38
## ptratio -0.46  0.39 -0.13  0.18 -0.36  0.26 -0.23  0.47    1.00 -0.18
## black    0.17 -0.28  0.05 -0.29  0.09 -0.21  0.23 -0.38   -0.18  1.00
## lstat   -0.46  0.61 -0.05  0.60 -0.63  0.61 -0.51  0.56    0.38 -0.28
## medv     0.40 -0.50  0.16 -0.43  0.71 -0.38  0.26 -0.49   -0.52  0.29
## target  -0.47  0.60  0.08  0.73 -0.15  0.63 -0.62  0.61    0.25 -0.28
## new     -0.37  0.62 -0.11  0.59 -0.39  0.49 -0.48  0.80    0.45 -0.41
##         lstat  medv target   new
## zn      -0.46  0.40  -0.47 -0.37
## indus    0.61 -0.50   0.60  0.62
## chas    -0.05  0.16   0.08 -0.11
## nox      0.60 -0.43   0.73  0.59
## rm      -0.63  0.71  -0.15 -0.39
## age      0.61 -0.38   0.63  0.49
## dis     -0.51  0.26  -0.62 -0.48
## tax      0.56 -0.49   0.61  0.80
## ptratio  0.38 -0.52   0.25  0.45
## black   -0.28  0.29  -0.28 -0.41
## lstat    1.00 -0.74   0.47  0.72
## medv    -0.74  1.00  -0.27 -0.69
## target   0.47 -0.27   1.00  0.49
## new      0.72 -0.69   0.49  1.00
## 
## n= 466 
## 
## 
## P
##         zn     indus  chas   nox    rm     age    dis    tax    ptratio
## zn             0.0000 0.4059 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
## indus   0.0000        0.1874 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
## chas    0.4059 0.1874        0.0355 0.0509 0.0890 0.0372 0.3138 0.0054 
## nox     0.0000 0.0000 0.0355        0.0000 0.0000 0.0000 0.0000 0.0001 
## rm      0.0000 0.0000 0.0509 0.0000        0.0000 0.0000 0.0000 0.0000 
## age     0.0000 0.0000 0.0890 0.0000 0.0000        0.0000 0.0000 0.0000 
## dis     0.0000 0.0000 0.0372 0.0000 0.0000 0.0000        0.0000 0.0000 
## tax     0.0000 0.0000 0.3138 0.0000 0.0000 0.0000 0.0000        0.0000 
## ptratio 0.0000 0.0000 0.0054 0.0001 0.0000 0.0000 0.0000 0.0000        
## black   0.0002 0.0000 0.2570 0.0000 0.0642 0.0000 0.0000 0.0000 0.0000 
## lstat   0.0000 0.0000 0.2679 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
## medv    0.0000 0.0000 0.0005 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
## target  0.0000 0.0000 0.0843 0.0000 0.0010 0.0000 0.0000 0.0000 0.0000 
## new     0.0000 0.0000 0.0223 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
##         black  lstat  medv   target new   
## zn      0.0002 0.0000 0.0000 0.0000 0.0000
## indus   0.0000 0.0000 0.0000 0.0000 0.0000
## chas    0.2570 0.2679 0.0005 0.0843 0.0223
## nox     0.0000 0.0000 0.0000 0.0000 0.0000
## rm      0.0642 0.0000 0.0000 0.0010 0.0000
## age     0.0000 0.0000 0.0000 0.0000 0.0000
## dis     0.0000 0.0000 0.0000 0.0000 0.0000
## tax     0.0000 0.0000 0.0000 0.0000 0.0000
## ptratio 0.0000 0.0000 0.0000 0.0000 0.0000
## black          0.0000 0.0000 0.0000 0.0000
## lstat   0.0000        0.0000 0.0000 0.0000
## medv    0.0000 0.0000        0.0000 0.0000
## target  0.0000 0.0000 0.0000        0.0000
## new     0.0000 0.0000 0.0000 0.0000
corrplot(cor(train), method="square")

Build Models

#MODEL 1
logit <- glm(target ~ ., data=train, family = "binomial" (link="logit"))
summary(logit)
## 
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1097  -0.2702  -0.0188   0.1965   3.5752  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -33.029280   8.409645  -3.928 8.58e-05 ***
## zn           -0.681913   0.248420  -2.745 0.006051 ** 
## indus        -0.152009   0.049530  -3.069 0.002148 ** 
## chas          2.500407   0.970167   2.577 0.009958 ** 
## nox          51.588709   7.272073   7.094 1.30e-12 ***
## rm           -0.093566   0.620763  -0.151 0.880190    
## age           0.024907   0.012537   1.987 0.046958 *  
## dis           0.922461   0.220580   4.182 2.89e-05 ***
## tax           0.008238   0.002404   3.427 0.000611 ***
## ptratio       0.298144   0.113540   2.626 0.008642 ** 
## black        -1.785172   1.031215  -1.731 0.083428 .  
## lstat         0.087574   0.052940   1.654 0.098088 .  
## medv          0.181463   0.061169   2.967 0.003011 ** 
## new          -0.425892   0.194726  -2.187 0.028732 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 223.59  on 452  degrees of freedom
## AIC: 251.59
## 
## Number of Fisher Scoring iterations: 8
exp(logit$coefficients)
##  (Intercept)           zn        indus         chas          nox 
## 4.524451e-15 5.056487e-01 8.589803e-01 1.218746e+01 2.539168e+22 
##           rm          age          dis          tax      ptratio 
## 9.106778e-01 1.025219e+00 2.515473e+00 1.008272e+00 1.347356e+00 
##        black        lstat         medv          new 
## 1.677682e-01 1.091523e+00 1.198970e+00 6.531872e-01
logitscalar <- mean(dlogis(predict(logit, type = "link")))
logitscalar * coef(logit)
##   (Intercept)            zn         indus          chas           nox 
## -2.5169305639 -0.0519638271 -0.0115835637  0.1905385638  3.9312148611 
##            rm           age           dis           tax       ptratio 
## -0.0071300236  0.0018979633  0.0702942880  0.0006277545  0.0227194938 
##         black         lstat          medv           new 
## -0.1360354839  0.0066733801  0.0138280224 -0.0324542205
confint.default(logit)
##                     2.5 %       97.5 %
## (Intercept) -4.951188e+01 -16.54667903
## zn          -1.168807e+00  -0.19501926
## indus       -2.490869e-01  -0.05493168
## chas         5.989156e-01   4.40189905
## nox          3.733571e+01  65.84171053
## rm          -1.310238e+00   1.12310610
## age          3.349061e-04   0.04947844
## dis          4.901328e-01   1.35478874
## tax          3.526349e-03   0.01294950
## ptratio      7.560983e-02   0.52067879
## black       -3.806316e+00   0.23597177
## lstat       -1.618741e-02   0.19133483
## medv         6.157454e-02   0.30135134
## new         -8.075467e-01  -0.04423648
predlogit <- predict(logit, type="response")
train2$pred1 <- predict(logit, type="response")
summary(predlogit)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0001109 0.0309268 0.4822112 0.4914163 0.9714183 1.0000000
table(true = train$target, pred = round(fitted(logit)))
##     pred
## true   0   1
##    0 213  24
##    1  22 207
#plots for Model 1
ggplot(data = train, aes(x = age, y = target)) + 
    geom_point(color = 'grey50') + 
    geom_point(data = train2, aes(x = age, y = pred1), color = 'red', size = 0.3) + 
    theme_bw()

data.frame(train2$pred1) %>%
    ggplot(aes(x = train2.pred1)) + 
    geom_histogram(bins = 50, fill = 'grey50') +
    labs(title = 'Histogram of Predictions') +
    theme_bw()

plot.roc(train$target, train2$pred1)

#extract variables that are significant and rerun model
sigvars <- data.frame(summary(logit)$coef[summary(logit)$coef[,4] <= .05, 4])
sigvars <- add_rownames(sigvars, "vars")
## Warning: Deprecated, use tibble::rownames_to_column() instead.
colist<-dplyr::pull(sigvars, vars)
colist<-colist[2:11]

idx <- match(colist, names(train))
trainmod2 <- cbind(train[,idx], train2['target'])

#MODEL 2
logit2 <- glm(target ~ ., data=trainmod2, family = "binomial" (link="logit"))
summary(logit2)
## 
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"), 
##     data = trainmod2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3341  -0.3000  -0.0271   0.2216   3.5120  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -40.559519   5.624242  -7.212 5.53e-13 ***
## zn           -0.570579   0.227004  -2.514 0.011953 *  
## indus        -0.141446   0.049062  -2.883 0.003939 ** 
## chas          2.791745   0.970675   2.876 0.004026 ** 
## nox          48.903546   6.885627   7.102 1.23e-12 ***
## age           0.027388   0.010057   2.723 0.006465 ** 
## dis           0.804755   0.203539   3.954 7.69e-05 ***
## tax           0.008311   0.002432   3.418 0.000632 ***
## ptratio       0.289929   0.107363   2.700 0.006925 ** 
## medv          0.130716   0.033228   3.934 8.36e-05 ***
## new          -0.280439   0.208775  -1.343 0.179187    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 234.26  on 455  degrees of freedom
## AIC: 256.26
## 
## Number of Fisher Scoring iterations: 7
exp(logit2$coefficients)
##  (Intercept)           zn        indus         chas          nox 
## 2.427866e-18 5.651983e-01 8.681022e-01 1.630945e+01 1.731970e+21 
##          age          dis          tax      ptratio         medv 
## 1.027766e+00 2.236148e+00 1.008346e+00 1.336332e+00 1.139644e+00 
##          new 
## 7.554517e-01
logit2scalar <- mean(dlogis(predict(logit2, type = "link")))
logit2scalar * coef(logit2)
##   (Intercept)            zn         indus          chas           nox 
## -3.2356595453 -0.0455182453 -0.0112839288  0.2227131005  3.9013092282 
##           age           dis           tax       ptratio          medv 
##  0.0021848724  0.0641997863  0.0006630314  0.0231292172  0.0104279150 
##           new 
## -0.0223722213
predlogit2 <- predict(logit2, type="response")
train2$pred2 <- predict(logit2, type="response")

summary(predlogit2)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0002271 0.0330183 0.4686400 0.4914163 0.9662585 0.9999995
table(true = train$target, pred = round(fitted(logit2)))
##     pred
## true   0   1
##    0 214  23
##    1  25 204
#plots for Model 2
ggplot(data = train, aes(x = age, y = target)) + 
    geom_point(color = 'grey50') + 
    geom_point(data = train2, aes(x = age, y = pred2), color = 'red', size = 0.3) + 
    theme_bw()

data.frame(train2$pred2) %>%
    ggplot(aes(x = train2.pred2)) + 
    geom_histogram(bins = 50, fill = 'grey50') +
    labs(title = 'Histogram of Predictions') +
    theme_bw()

plot.roc(train$target, train2$pred2)

#MODEL 3
#PC Model no racial bias
logit3<-model <- glm(target ~ zn + indus + nox + rm + age + dis + tax + ptratio + medv + new, data=train, family = "binomial" (link="logit"))
summary(logit3)
## 
## Call:
## glm(formula = target ~ zn + indus + nox + rm + age + dis + tax + 
##     ptratio + medv + new, family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3623  -0.3016  -0.0318   0.2470   3.4208  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -35.640886   5.181413  -6.879 6.04e-12 ***
## zn           -0.607165   0.230874  -2.630 0.008542 ** 
## indus        -0.122350   0.045692  -2.678 0.007413 ** 
## nox          44.778082   6.398918   6.998 2.60e-12 ***
## rm           -0.332101   0.508476  -0.653 0.513673    
## age           0.030652   0.010570   2.900 0.003732 ** 
## dis           0.771242   0.203098   3.797 0.000146 ***
## tax           0.007710   0.002318   3.326 0.000880 ***
## ptratio       0.231692   0.105821   2.189 0.028563 *  
## medv          0.151192   0.054648   2.767 0.005663 ** 
## new          -0.250426   0.197261  -1.270 0.204257    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 242.17  on 455  degrees of freedom
## AIC: 264.17
## 
## Number of Fisher Scoring iterations: 7
exp(logit3$coefficients)
##  (Intercept)           zn        indus          nox           rm 
## 3.321695e-16 5.448937e-01 8.848385e-01 2.798168e+19 7.174150e-01 
##          age          dis          tax      ptratio         medv 
## 1.031126e+00 2.162451e+00 1.007740e+00 1.260731e+00 1.163220e+00 
##          new 
## 7.784692e-01
predlogit3 <- predict(logit3, type="response")
train2$pred3 <- predict(logit3, type="response")
summary(predlogit3)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0003552 0.0393321 0.4672500 0.4914163 0.9610074 0.9999973
table(true = train$target, pred = round(fitted(logit3)))
##     pred
## true   0   1
##    0 210  27
##    1  26 203
#plots for Model 2
ggplot(data = train, aes(x = age, y = target)) + 
    geom_point(color = 'grey50') + 
    geom_point(data = train2, aes(x = age, y = pred3), color = 'red', size = 0.3) + 
    theme_bw()

data.frame(train2$pred3) %>%
    ggplot(aes(x = train2.pred3)) + 
    geom_histogram(bins = 50, fill = 'grey50') +
    labs(title = 'Histogram of Predictions') +
    theme_bw()

plot.roc(train$target, train2$pred3)

logit3scalar <- mean(dlogis(predict(logit3, type = "link")))
logit3scalar * coef(logit3)
##   (Intercept)            zn         indus           nox            rm 
## -2.9404579504 -0.0500925172 -0.0100941744  3.6942983150 -0.0273991093 
##           age           dis           tax       ptratio          medv 
##  0.0025288302  0.0636293182  0.0006361103  0.0191151040  0.0124736734 
##           new 
## -0.0206607316
round(logitscalar * coef(logit),2)
## (Intercept)          zn       indus        chas         nox          rm 
##       -2.52       -0.05       -0.01        0.19        3.93       -0.01 
##         age         dis         tax     ptratio       black       lstat 
##        0.00        0.07        0.00        0.02       -0.14        0.01 
##        medv         new 
##        0.01       -0.03
round(logit2scalar * coef(logit2),2)
## (Intercept)          zn       indus        chas         nox         age 
##       -3.24       -0.05       -0.01        0.22        3.90        0.00 
##         dis         tax     ptratio        medv         new 
##        0.06        0.00        0.02        0.01       -0.02
round(logit3scalar * coef(logit3),2)
## (Intercept)          zn       indus         nox          rm         age 
##       -2.94       -0.05       -0.01        3.69       -0.03        0.00 
##         dis         tax     ptratio        medv         new 
##        0.06        0.00        0.02        0.01       -0.02

Select Models

test = read.csv(file="data/crime-evaluation-data.csv")
test2<- test
dim(test)
## [1] 40 13
# transform data using log for skewed

test$zn <- log(test$zn+1)
test$black <- log(test$black+1)
test$chas <- log(test$chas+1)


#remove rad per correlation in prior section

test <- test[, !(colnames(test) %in% c("rad"))]

#create variable
test$new <- test$tax / (test$medv*10)

idx <- match(colist, names(test))
testNorm2 <- test[,idx]

summary(testNorm2)
##        zn             indus             chas              nox        
##  Min.   :0.0000   Min.   : 1.760   Min.   :0.00000   Min.   :0.3850  
##  1st Qu.:0.0000   1st Qu.: 5.692   1st Qu.:0.00000   1st Qu.:0.4713  
##  Median :0.0000   Median : 8.915   Median :0.00000   Median :0.5380  
##  Mean   :0.6619   Mean   :11.507   Mean   :0.03466   Mean   :0.5592  
##  3rd Qu.:0.0000   3rd Qu.:18.100   3rd Qu.:0.00000   3rd Qu.:0.6258  
##  Max.   :4.5109   Max.   :25.650   Max.   :0.69315   Max.   :0.7400  
##       age              dis             tax           ptratio     
##  Min.   :  6.80   Min.   :1.202   Min.   :188.0   Min.   :14.70  
##  1st Qu.: 56.62   1st Qu.:2.041   1st Qu.:276.8   1st Qu.:18.40  
##  Median : 83.25   Median :3.373   Median :307.0   Median :19.60  
##  Mean   : 70.99   Mean   :3.787   Mean   :393.5   Mean   :19.12  
##  3rd Qu.: 93.10   3rd Qu.:4.527   3rd Qu.:666.0   3rd Qu.:20.20  
##  Max.   :100.00   Max.   :9.089   Max.   :666.0   Max.   :21.20  
##       medv            new        
##  Min.   : 8.40   Min.   :0.6356  
##  1st Qu.:16.98   1st Qu.:1.1427  
##  Median :20.55   Median :1.3538  
##  Mean   :21.88   Mean   :2.2218  
##  3rd Qu.:25.00   3rd Qu.:2.5804  
##  Max.   :50.00   Max.   :7.9286
modelfinal <- predict(logit2, newdata = testNorm2, type="response")

y_pred_num <- ifelse(modelfinal > 0.5, 1, 0)
y_pred <- factor(y_pred_num, levels=c(0, 1))
summary(y_pred)
##  0  1 
## 21 19
rbind(round(summary(predlogit2),4), round(summary(modelfinal),4)) %>% kable()
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0002 0.0330 0.4686 0.4914 0.9663 1
0.0011 0.0515 0.4513 0.4887 0.9067 1