Data 621

Abstract

This assignment focused on census type data for the City of Boston. The dataset contains 466 records that encompass the entire area. The variables for the data are housing type variables such as number of rooms but the records do not give an indication of the part of the city it is representing. The purpose for this assignment is to analyze the data, perform any data manipulation / clean-up and build three (3) binary logistic regression models using only the data (or derivatives thereof) to predict if the region is above or below the median crime rate. The chosen model provided an AIC = 251.59.

Keywords: crime, data621

Data Exploration

knitr::opts_chunk$set(echo = TRUE)
library(e1071)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(purrr)
library(tidyr)
library(ggplot2)
library(corrplot)

## corrplot 0.84 loaded

library(FactoMineR)

## Warning: package 'FactoMineR' was built under R version 3.4.4

library(VIF)
library(knitr)
library(kableExtra)

## Warning: package 'kableExtra' was built under R version 3.4.4

library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Warning: package 'Formula' was built under R version 3.4.4

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following object is masked from 'package:e1071':
## 
##     impute

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(pROC)

## Warning: package 'pROC' was built under R version 3.4.4

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# read data
train = read.csv(file="data/crime-training-data.csv")
train2<-train
dim(train)

## [1] 466  14

#check data
summary(train) %>% kable() %>% kable_styling()

`zn </th>`	`indus </th>`	`chas </th>`	`nox </th>`	`rm </th>`	`age </th>`	`dis </th>`	`rad </th>`	`tax </th>`	`ptratio </th>`	`black </th>`	`lstat </th>`	`medv </th>`	`target </th>`
Min. : 0.00	Min. : 0.460	Min. :0.00000	Min. :0.3890	Min. :3.863	Min. : 2.90	Min. : 1.130	Min. : 1.00	Min. :187.0	Min. :12.6	Min. : 0.32	Min. : 1.730	Min. : 5.00	Min. :0.0000
1st Qu.: 0.00	1st Qu.: 5.145	1st Qu.:0.00000	1st Qu.:0.4480	1st Qu.:5.887	1st Qu.: 43.88	1st Qu.: 2.101	1st Qu.: 4.00	1st Qu.:281.0	1st Qu.:16.9	1st Qu.:375.61	1st Qu.: 7.043	1st Qu.:17.02	1st Qu.:0.0000
Median : 0.00	Median : 9.690	Median :0.00000	Median :0.5380	Median :6.210	Median : 77.15	Median : 3.191	Median : 5.00	Median :334.5	Median :18.9	Median :391.34	Median :11.350	Median :21.20	Median :0.0000
Mean : 11.58	Mean :11.105	Mean :0.07082	Mean :0.5543	Mean :6.291	Mean : 68.37	Mean : 3.796	Mean : 9.53	Mean :409.5	Mean :18.4	Mean :357.12	Mean :12.631	Mean :22.59	Mean :0.4914
3rd Qu.: 16.25	3rd Qu.:18.100	3rd Qu.:0.00000	3rd Qu.:0.6240	3rd Qu.:6.630	3rd Qu.: 94.10	3rd Qu.: 5.215	3rd Qu.:24.00	3rd Qu.:666.0	3rd Qu.:20.2	3rd Qu.:396.24	3rd Qu.:16.930	3rd Qu.:25.00	3rd Qu.:1.0000
Max. :100.00	Max. :27.740	Max. :1.00000	Max. :0.8710	Max. :8.780	Max. :100.00	Max. :12.127	Max. :24.00	Max. :711.0	Max. :22.0	Max. :396.90	Max. :37.970	Max. :50.00	Max. :1.0000

sapply(train, function(x) sum(is.na(x)))

##      zn   indus    chas     nox      rm     age     dis     rad     tax 
##       0       0       0       0       0       0       0       0       0 
## ptratio   black   lstat    medv  target 
##       0       0       0       0       0

ntrain<-select_if(train, is.numeric)
ntrain %>%
  keep(is.numeric) %>%                     # Keep only numeric columns
  gather() %>%                             # Convert to key-value pairs
  ggplot(aes(value)) +                     # Plot the values
    facet_wrap(~ key, scales = "free") +   # In separate panels
    geom_density()

rcorr(as.matrix(train))

##            zn indus  chas   nox    rm   age   dis   rad   tax ptratio
## zn       1.00 -0.54 -0.04 -0.52  0.32 -0.57  0.66 -0.32 -0.32   -0.39
## indus   -0.54  1.00  0.06  0.76 -0.39  0.64 -0.70  0.60  0.73    0.39
## chas    -0.04  0.06  1.00  0.10  0.09  0.08 -0.10 -0.02 -0.05   -0.13
## nox     -0.52  0.76  0.10  1.00 -0.30  0.74 -0.77  0.60  0.65    0.18
## rm       0.32 -0.39  0.09 -0.30  1.00 -0.23  0.20 -0.21 -0.30   -0.36
## age     -0.57  0.64  0.08  0.74 -0.23  1.00 -0.75  0.46  0.51    0.26
## dis      0.66 -0.70 -0.10 -0.77  0.20 -0.75  1.00 -0.49 -0.53   -0.23
## rad     -0.32  0.60 -0.02  0.60 -0.21  0.46 -0.49  1.00  0.91    0.47
## tax     -0.32  0.73 -0.05  0.65 -0.30  0.51 -0.53  0.91  1.00    0.47
## ptratio -0.39  0.39 -0.13  0.18 -0.36  0.26 -0.23  0.47  0.47    1.00
## black    0.18 -0.36  0.04 -0.38  0.13 -0.27  0.29 -0.45 -0.44   -0.18
## lstat   -0.43  0.61 -0.05  0.60 -0.63  0.61 -0.51  0.50  0.56    0.38
## medv     0.38 -0.50  0.16 -0.43  0.71 -0.38  0.26 -0.40 -0.49   -0.52
## target  -0.43  0.60  0.08  0.73 -0.15  0.63 -0.62  0.63  0.61    0.25
##         black lstat  medv target
## zn       0.18 -0.43  0.38  -0.43
## indus   -0.36  0.61 -0.50   0.60
## chas     0.04 -0.05  0.16   0.08
## nox     -0.38  0.60 -0.43   0.73
## rm       0.13 -0.63  0.71  -0.15
## age     -0.27  0.61 -0.38   0.63
## dis      0.29 -0.51  0.26  -0.62
## rad     -0.45  0.50 -0.40   0.63
## tax     -0.44  0.56 -0.49   0.61
## ptratio -0.18  0.38 -0.52   0.25
## black    1.00 -0.35  0.33  -0.35
## lstat   -0.35  1.00 -0.74   0.47
## medv     0.33 -0.74  1.00  -0.27
## target  -0.35  0.47 -0.27   1.00
## 
## n= 466 
## 
## 
## P
##         zn     indus  chas   nox    rm     age    dis    rad    tax   
## zn             0.0000 0.3870 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## indus   0.0000        0.1874 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## chas    0.3870 0.1874        0.0355 0.0509 0.0890 0.0372 0.7321 0.3138
## nox     0.0000 0.0000 0.0355        0.0000 0.0000 0.0000 0.0000 0.0000
## rm      0.0000 0.0000 0.0509 0.0000        0.0000 0.0000 0.0000 0.0000
## age     0.0000 0.0000 0.0890 0.0000 0.0000        0.0000 0.0000 0.0000
## dis     0.0000 0.0000 0.0372 0.0000 0.0000 0.0000        0.0000 0.0000
## rad     0.0000 0.0000 0.7321 0.0000 0.0000 0.0000 0.0000        0.0000
## tax     0.0000 0.0000 0.3138 0.0000 0.0000 0.0000 0.0000 0.0000       
## ptratio 0.0000 0.0000 0.0054 0.0001 0.0000 0.0000 0.0000 0.0000 0.0000
## black   0.0000 0.0000 0.3384 0.0000 0.0041 0.0000 0.0000 0.0000 0.0000
## lstat   0.0000 0.0000 0.2679 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## medv    0.0000 0.0000 0.0005 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## target  0.0000 0.0000 0.0843 0.0000 0.0010 0.0000 0.0000 0.0000 0.0000
##         ptratio black  lstat  medv   target
## zn      0.0000  0.0000 0.0000 0.0000 0.0000
## indus   0.0000  0.0000 0.0000 0.0000 0.0000
## chas    0.0054  0.3384 0.2679 0.0005 0.0843
## nox     0.0001  0.0000 0.0000 0.0000 0.0000
## rm      0.0000  0.0041 0.0000 0.0000 0.0010
## age     0.0000  0.0000 0.0000 0.0000 0.0000
## dis     0.0000  0.0000 0.0000 0.0000 0.0000
## rad     0.0000  0.0000 0.0000 0.0000 0.0000
## tax     0.0000  0.0000 0.0000 0.0000 0.0000
## ptratio         0.0000 0.0000 0.0000 0.0000
## black   0.0000         0.0000 0.0000 0.0000
## lstat   0.0000  0.0000        0.0000 0.0000
## medv    0.0000  0.0000 0.0000        0.0000
## target  0.0000  0.0000 0.0000 0.0000

corrplot(cor(train), method="square")

# correlation test 1
cor.test(train$rad,train$tax,method="pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  train$rad and train$tax
## t = 46.239, df = 464, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8888115 0.9214292
## sample estimates:
##       cor 
## 0.9064632

#significant ignore

Data Preparation

# transform data using log for skewed

train$zn <- log(train$zn+1)
train$black <- log(train$black+1)
train$chas <- log(train$chas+1)


#remove rad per correlation in prior section

train <- train[, !(colnames(train) %in% c("rad"))]

#create variable
train$new <- train$tax / (train$medv*10)

rcorr(as.matrix(train))

##            zn indus  chas   nox    rm   age   dis   tax ptratio black
## zn       1.00 -0.60 -0.04 -0.55  0.34 -0.58  0.69 -0.40   -0.46  0.17
## indus   -0.60  1.00  0.06  0.76 -0.39  0.64 -0.70  0.73    0.39 -0.28
## chas    -0.04  0.06  1.00  0.10  0.09  0.08 -0.10 -0.05   -0.13  0.05
## nox     -0.55  0.76  0.10  1.00 -0.30  0.74 -0.77  0.65    0.18 -0.29
## rm       0.34 -0.39  0.09 -0.30  1.00 -0.23  0.20 -0.30   -0.36  0.09
## age     -0.58  0.64  0.08  0.74 -0.23  1.00 -0.75  0.51    0.26 -0.21
## dis      0.69 -0.70 -0.10 -0.77  0.20 -0.75  1.00 -0.53   -0.23  0.23
## tax     -0.40  0.73 -0.05  0.65 -0.30  0.51 -0.53  1.00    0.47 -0.38
## ptratio -0.46  0.39 -0.13  0.18 -0.36  0.26 -0.23  0.47    1.00 -0.18
## black    0.17 -0.28  0.05 -0.29  0.09 -0.21  0.23 -0.38   -0.18  1.00
## lstat   -0.46  0.61 -0.05  0.60 -0.63  0.61 -0.51  0.56    0.38 -0.28
## medv     0.40 -0.50  0.16 -0.43  0.71 -0.38  0.26 -0.49   -0.52  0.29
## target  -0.47  0.60  0.08  0.73 -0.15  0.63 -0.62  0.61    0.25 -0.28
## new     -0.37  0.62 -0.11  0.59 -0.39  0.49 -0.48  0.80    0.45 -0.41
##         lstat  medv target   new
## zn      -0.46  0.40  -0.47 -0.37
## indus    0.61 -0.50   0.60  0.62
## chas    -0.05  0.16   0.08 -0.11
## nox      0.60 -0.43   0.73  0.59
## rm      -0.63  0.71  -0.15 -0.39
## age      0.61 -0.38   0.63  0.49
## dis     -0.51  0.26  -0.62 -0.48
## tax      0.56 -0.49   0.61  0.80
## ptratio  0.38 -0.52   0.25  0.45
## black   -0.28  0.29  -0.28 -0.41
## lstat    1.00 -0.74   0.47  0.72
## medv    -0.74  1.00  -0.27 -0.69
## target   0.47 -0.27   1.00  0.49
## new      0.72 -0.69   0.49  1.00
## 
## n= 466 
## 
## 
## P
##         zn     indus  chas   nox    rm     age    dis    tax    ptratio
## zn             0.0000 0.4059 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
## indus   0.0000        0.1874 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
## chas    0.4059 0.1874        0.0355 0.0509 0.0890 0.0372 0.3138 0.0054 
## nox     0.0000 0.0000 0.0355        0.0000 0.0000 0.0000 0.0000 0.0001 
## rm      0.0000 0.0000 0.0509 0.0000        0.0000 0.0000 0.0000 0.0000 
## age     0.0000 0.0000 0.0890 0.0000 0.0000        0.0000 0.0000 0.0000 
## dis     0.0000 0.0000 0.0372 0.0000 0.0000 0.0000        0.0000 0.0000 
## tax     0.0000 0.0000 0.3138 0.0000 0.0000 0.0000 0.0000        0.0000 
## ptratio 0.0000 0.0000 0.0054 0.0001 0.0000 0.0000 0.0000 0.0000        
## black   0.0002 0.0000 0.2570 0.0000 0.0642 0.0000 0.0000 0.0000 0.0000 
## lstat   0.0000 0.0000 0.2679 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
## medv    0.0000 0.0000 0.0005 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
## target  0.0000 0.0000 0.0843 0.0000 0.0010 0.0000 0.0000 0.0000 0.0000 
## new     0.0000 0.0000 0.0223 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
##         black  lstat  medv   target new   
## zn      0.0002 0.0000 0.0000 0.0000 0.0000
## indus   0.0000 0.0000 0.0000 0.0000 0.0000
## chas    0.2570 0.2679 0.0005 0.0843 0.0223
## nox     0.0000 0.0000 0.0000 0.0000 0.0000
## rm      0.0642 0.0000 0.0000 0.0010 0.0000
## age     0.0000 0.0000 0.0000 0.0000 0.0000
## dis     0.0000 0.0000 0.0000 0.0000 0.0000
## tax     0.0000 0.0000 0.0000 0.0000 0.0000
## ptratio 0.0000 0.0000 0.0000 0.0000 0.0000
## black          0.0000 0.0000 0.0000 0.0000
## lstat   0.0000        0.0000 0.0000 0.0000
## medv    0.0000 0.0000        0.0000 0.0000
## target  0.0000 0.0000 0.0000        0.0000
## new     0.0000 0.0000 0.0000 0.0000

corrplot(cor(train), method="square")

Build Models

#MODEL 1
logit <- glm(target ~ ., data=train, family = "binomial" (link="logit"))
summary(logit)

## 
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1097  -0.2702  -0.0188   0.1965   3.5752  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -33.029280   8.409645  -3.928 8.58e-05 ***
## zn           -0.681913   0.248420  -2.745 0.006051 ** 
## indus        -0.152009   0.049530  -3.069 0.002148 ** 
## chas          2.500407   0.970167   2.577 0.009958 ** 
## nox          51.588709   7.272073   7.094 1.30e-12 ***
## rm           -0.093566   0.620763  -0.151 0.880190    
## age           0.024907   0.012537   1.987 0.046958 *  
## dis           0.922461   0.220580   4.182 2.89e-05 ***
## tax           0.008238   0.002404   3.427 0.000611 ***
## ptratio       0.298144   0.113540   2.626 0.008642 ** 
## black        -1.785172   1.031215  -1.731 0.083428 .  
## lstat         0.087574   0.052940   1.654 0.098088 .  
## medv          0.181463   0.061169   2.967 0.003011 ** 
## new          -0.425892   0.194726  -2.187 0.028732 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 223.59  on 452  degrees of freedom
## AIC: 251.59
## 
## Number of Fisher Scoring iterations: 8

exp(logit$coefficients)

##  (Intercept)           zn        indus         chas          nox 
## 4.524451e-15 5.056487e-01 8.589803e-01 1.218746e+01 2.539168e+22 
##           rm          age          dis          tax      ptratio 
## 9.106778e-01 1.025219e+00 2.515473e+00 1.008272e+00 1.347356e+00 
##        black        lstat         medv          new 
## 1.677682e-01 1.091523e+00 1.198970e+00 6.531872e-01

logitscalar <- mean(dlogis(predict(logit, type = "link")))
logitscalar * coef(logit)

##   (Intercept)            zn         indus          chas           nox 
## -2.5169305639 -0.0519638271 -0.0115835637  0.1905385638  3.9312148611 
##            rm           age           dis           tax       ptratio 
## -0.0071300236  0.0018979633  0.0702942880  0.0006277545  0.0227194938 
##         black         lstat          medv           new 
## -0.1360354839  0.0066733801  0.0138280224 -0.0324542205

confint.default(logit)

##                     2.5 %       97.5 %
## (Intercept) -4.951188e+01 -16.54667903
## zn          -1.168807e+00  -0.19501926
## indus       -2.490869e-01  -0.05493168
## chas         5.989156e-01   4.40189905
## nox          3.733571e+01  65.84171053
## rm          -1.310238e+00   1.12310610
## age          3.349061e-04   0.04947844
## dis          4.901328e-01   1.35478874
## tax          3.526349e-03   0.01294950
## ptratio      7.560983e-02   0.52067879
## black       -3.806316e+00   0.23597177
## lstat       -1.618741e-02   0.19133483
## medv         6.157454e-02   0.30135134
## new         -8.075467e-01  -0.04423648

predlogit <- predict(logit, type="response")
train2$pred1 <- predict(logit, type="response")
summary(predlogit)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0001109 0.0309268 0.4822112 0.4914163 0.9714183 1.0000000

table(true = train$target, pred = round(fitted(logit)))

##     pred
## true   0   1
##    0 213  24
##    1  22 207

#plots for Model 1
ggplot(data = train, aes(x = age, y = target)) + 
    geom_point(color = 'grey50') + 
    geom_point(data = train2, aes(x = age, y = pred1), color = 'red', size = 0.3) + 
    theme_bw()

data.frame(train2$pred1) %>%
    ggplot(aes(x = train2.pred1)) + 
    geom_histogram(bins = 50, fill = 'grey50') +
    labs(title = 'Histogram of Predictions') +
    theme_bw()

plot.roc(train$target, train2$pred1)

#extract variables that are significant and rerun model
sigvars <- data.frame(summary(logit)$coef[summary(logit)$coef[,4] <= .05, 4])
sigvars <- add_rownames(sigvars, "vars")

## Warning: Deprecated, use tibble::rownames_to_column() instead.

colist<-dplyr::pull(sigvars, vars)
colist<-colist[2:11]

idx <- match(colist, names(train))
trainmod2 <- cbind(train[,idx], train2['target'])

#MODEL 2
logit2 <- glm(target ~ ., data=trainmod2, family = "binomial" (link="logit"))
summary(logit2)

## 
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"), 
##     data = trainmod2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3341  -0.3000  -0.0271   0.2216   3.5120  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -40.559519   5.624242  -7.212 5.53e-13 ***
## zn           -0.570579   0.227004  -2.514 0.011953 *  
## indus        -0.141446   0.049062  -2.883 0.003939 ** 
## chas          2.791745   0.970675   2.876 0.004026 ** 
## nox          48.903546   6.885627   7.102 1.23e-12 ***
## age           0.027388   0.010057   2.723 0.006465 ** 
## dis           0.804755   0.203539   3.954 7.69e-05 ***
## tax           0.008311   0.002432   3.418 0.000632 ***
## ptratio       0.289929   0.107363   2.700 0.006925 ** 
## medv          0.130716   0.033228   3.934 8.36e-05 ***
## new          -0.280439   0.208775  -1.343 0.179187    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 234.26  on 455  degrees of freedom
## AIC: 256.26
## 
## Number of Fisher Scoring iterations: 7

exp(logit2$coefficients)

##  (Intercept)           zn        indus         chas          nox 
## 2.427866e-18 5.651983e-01 8.681022e-01 1.630945e+01 1.731970e+21 
##          age          dis          tax      ptratio         medv 
## 1.027766e+00 2.236148e+00 1.008346e+00 1.336332e+00 1.139644e+00 
##          new 
## 7.554517e-01

logit2scalar <- mean(dlogis(predict(logit2, type = "link")))
logit2scalar * coef(logit2)

##   (Intercept)            zn         indus          chas           nox 
## -3.2356595453 -0.0455182453 -0.0112839288  0.2227131005  3.9013092282 
##           age           dis           tax       ptratio          medv 
##  0.0021848724  0.0641997863  0.0006630314  0.0231292172  0.0104279150 
##           new 
## -0.0223722213

predlogit2 <- predict(logit2, type="response")
train2$pred2 <- predict(logit2, type="response")

summary(predlogit2)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0002271 0.0330183 0.4686400 0.4914163 0.9662585 0.9999995

table(true = train$target, pred = round(fitted(logit2)))

##     pred
## true   0   1
##    0 214  23
##    1  25 204

#plots for Model 2
ggplot(data = train, aes(x = age, y = target)) + 
    geom_point(color = 'grey50') + 
    geom_point(data = train2, aes(x = age, y = pred2), color = 'red', size = 0.3) + 
    theme_bw()

data.frame(train2$pred2) %>%
    ggplot(aes(x = train2.pred2)) + 
    geom_histogram(bins = 50, fill = 'grey50') +
    labs(title = 'Histogram of Predictions') +
    theme_bw()

plot.roc(train$target, train2$pred2)

#MODEL 3
#PC Model no racial bias
logit3<-model <- glm(target ~ zn + indus + nox + rm + age + dis + tax + ptratio + medv + new, data=train, family = "binomial" (link="logit"))
summary(logit3)

## 
## Call:
## glm(formula = target ~ zn + indus + nox + rm + age + dis + tax + 
##     ptratio + medv + new, family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3623  -0.3016  -0.0318   0.2470   3.4208  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -35.640886   5.181413  -6.879 6.04e-12 ***
## zn           -0.607165   0.230874  -2.630 0.008542 ** 
## indus        -0.122350   0.045692  -2.678 0.007413 ** 
## nox          44.778082   6.398918   6.998 2.60e-12 ***
## rm           -0.332101   0.508476  -0.653 0.513673    
## age           0.030652   0.010570   2.900 0.003732 ** 
## dis           0.771242   0.203098   3.797 0.000146 ***
## tax           0.007710   0.002318   3.326 0.000880 ***
## ptratio       0.231692   0.105821   2.189 0.028563 *  
## medv          0.151192   0.054648   2.767 0.005663 ** 
## new          -0.250426   0.197261  -1.270 0.204257    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 242.17  on 455  degrees of freedom
## AIC: 264.17
## 
## Number of Fisher Scoring iterations: 7

exp(logit3$coefficients)

##  (Intercept)           zn        indus          nox           rm 
## 3.321695e-16 5.448937e-01 8.848385e-01 2.798168e+19 7.174150e-01 
##          age          dis          tax      ptratio         medv 
## 1.031126e+00 2.162451e+00 1.007740e+00 1.260731e+00 1.163220e+00 
##          new 
## 7.784692e-01

predlogit3 <- predict(logit3, type="response")
train2$pred3 <- predict(logit3, type="response")
summary(predlogit3)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0003552 0.0393321 0.4672500 0.4914163 0.9610074 0.9999973

table(true = train$target, pred = round(fitted(logit3)))

##     pred
## true   0   1
##    0 210  27
##    1  26 203

#plots for Model 2
ggplot(data = train, aes(x = age, y = target)) + 
    geom_point(color = 'grey50') + 
    geom_point(data = train2, aes(x = age, y = pred3), color = 'red', size = 0.3) + 
    theme_bw()

data.frame(train2$pred3) %>%
    ggplot(aes(x = train2.pred3)) + 
    geom_histogram(bins = 50, fill = 'grey50') +
    labs(title = 'Histogram of Predictions') +
    theme_bw()

plot.roc(train$target, train2$pred3)

logit3scalar <- mean(dlogis(predict(logit3, type = "link")))
logit3scalar * coef(logit3)

##   (Intercept)            zn         indus           nox            rm 
## -2.9404579504 -0.0500925172 -0.0100941744  3.6942983150 -0.0273991093 
##           age           dis           tax       ptratio          medv 
##  0.0025288302  0.0636293182  0.0006361103  0.0191151040  0.0124736734 
##           new 
## -0.0206607316

round(logitscalar * coef(logit),2)

## (Intercept)          zn       indus        chas         nox          rm 
##       -2.52       -0.05       -0.01        0.19        3.93       -0.01 
##         age         dis         tax     ptratio       black       lstat 
##        0.00        0.07        0.00        0.02       -0.14        0.01 
##        medv         new 
##        0.01       -0.03

round(logit2scalar * coef(logit2),2)

## (Intercept)          zn       indus        chas         nox         age 
##       -3.24       -0.05       -0.01        0.22        3.90        0.00 
##         dis         tax     ptratio        medv         new 
##        0.06        0.00        0.02        0.01       -0.02

round(logit3scalar * coef(logit3),2)

## (Intercept)          zn       indus         nox          rm         age 
##       -2.94       -0.05       -0.01        3.69       -0.03        0.00 
##         dis         tax     ptratio        medv         new 
##        0.06        0.00        0.02        0.01       -0.02

Select Models

test = read.csv(file="data/crime-evaluation-data.csv")
test2<- test
dim(test)

## [1] 40 13

# transform data using log for skewed

test$zn <- log(test$zn+1)
test$black <- log(test$black+1)
test$chas <- log(test$chas+1)


#remove rad per correlation in prior section

test <- test[, !(colnames(test) %in% c("rad"))]

#create variable
test$new <- test$tax / (test$medv*10)

idx <- match(colist, names(test))
testNorm2 <- test[,idx]

summary(testNorm2)

##        zn             indus             chas              nox        
##  Min.   :0.0000   Min.   : 1.760   Min.   :0.00000   Min.   :0.3850  
##  1st Qu.:0.0000   1st Qu.: 5.692   1st Qu.:0.00000   1st Qu.:0.4713  
##  Median :0.0000   Median : 8.915   Median :0.00000   Median :0.5380  
##  Mean   :0.6619   Mean   :11.507   Mean   :0.03466   Mean   :0.5592  
##  3rd Qu.:0.0000   3rd Qu.:18.100   3rd Qu.:0.00000   3rd Qu.:0.6258  
##  Max.   :4.5109   Max.   :25.650   Max.   :0.69315   Max.   :0.7400  
##       age              dis             tax           ptratio     
##  Min.   :  6.80   Min.   :1.202   Min.   :188.0   Min.   :14.70  
##  1st Qu.: 56.62   1st Qu.:2.041   1st Qu.:276.8   1st Qu.:18.40  
##  Median : 83.25   Median :3.373   Median :307.0   Median :19.60  
##  Mean   : 70.99   Mean   :3.787   Mean   :393.5   Mean   :19.12  
##  3rd Qu.: 93.10   3rd Qu.:4.527   3rd Qu.:666.0   3rd Qu.:20.20  
##  Max.   :100.00   Max.   :9.089   Max.   :666.0   Max.   :21.20  
##       medv            new        
##  Min.   : 8.40   Min.   :0.6356  
##  1st Qu.:16.98   1st Qu.:1.1427  
##  Median :20.55   Median :1.3538  
##  Mean   :21.88   Mean   :2.2218  
##  3rd Qu.:25.00   3rd Qu.:2.5804  
##  Max.   :50.00   Max.   :7.9286

modelfinal <- predict(logit2, newdata = testNorm2, type="response")

y_pred_num <- ifelse(modelfinal > 0.5, 1, 0)
y_pred <- factor(y_pred_num, levels=c(0, 1))
summary(y_pred)

##  0  1 
## 21 19

rbind(round(summary(predlogit2),4), round(summary(modelfinal),4)) %>% kable()

Min.	1st Qu.	Median	Mean	3rd Qu.	Max.
0.0002	0.0330	0.4686	0.4914	0.9663	1
0.0011	0.0515	0.4513	0.4887	0.9067	1