Data Exploration
knitr::opts_chunk$set(echo = TRUE)
library(e1071)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(tidyr)
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 3.4.4
library(VIF)
library(knitr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.4.4
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.4.4
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:base':
##
## format.pval, units
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# read data
train = read.csv(file="data/crime-training-data.csv")
train2<-train
dim(train)
## [1] 466 14
#check data
summary(train) %>% kable() %>% kable_styling()
|
zn </th>
|
indus </th>
|
chas </th>
|
nox </th>
|
rm </th>
|
age </th>
|
dis </th>
|
rad </th>
|
tax </th>
|
ptratio </th>
|
black </th>
|
lstat </th>
|
medv </th>
|
target </th>
|
|
Min. : 0.00
|
Min. : 0.460
|
Min. :0.00000
|
Min. :0.3890
|
Min. :3.863
|
Min. : 2.90
|
Min. : 1.130
|
Min. : 1.00
|
Min. :187.0
|
Min. :12.6
|
Min. : 0.32
|
Min. : 1.730
|
Min. : 5.00
|
Min. :0.0000
|
|
1st Qu.: 0.00
|
1st Qu.: 5.145
|
1st Qu.:0.00000
|
1st Qu.:0.4480
|
1st Qu.:5.887
|
1st Qu.: 43.88
|
1st Qu.: 2.101
|
1st Qu.: 4.00
|
1st Qu.:281.0
|
1st Qu.:16.9
|
1st Qu.:375.61
|
1st Qu.: 7.043
|
1st Qu.:17.02
|
1st Qu.:0.0000
|
|
Median : 0.00
|
Median : 9.690
|
Median :0.00000
|
Median :0.5380
|
Median :6.210
|
Median : 77.15
|
Median : 3.191
|
Median : 5.00
|
Median :334.5
|
Median :18.9
|
Median :391.34
|
Median :11.350
|
Median :21.20
|
Median :0.0000
|
|
Mean : 11.58
|
Mean :11.105
|
Mean :0.07082
|
Mean :0.5543
|
Mean :6.291
|
Mean : 68.37
|
Mean : 3.796
|
Mean : 9.53
|
Mean :409.5
|
Mean :18.4
|
Mean :357.12
|
Mean :12.631
|
Mean :22.59
|
Mean :0.4914
|
|
3rd Qu.: 16.25
|
3rd Qu.:18.100
|
3rd Qu.:0.00000
|
3rd Qu.:0.6240
|
3rd Qu.:6.630
|
3rd Qu.: 94.10
|
3rd Qu.: 5.215
|
3rd Qu.:24.00
|
3rd Qu.:666.0
|
3rd Qu.:20.2
|
3rd Qu.:396.24
|
3rd Qu.:16.930
|
3rd Qu.:25.00
|
3rd Qu.:1.0000
|
|
Max. :100.00
|
Max. :27.740
|
Max. :1.00000
|
Max. :0.8710
|
Max. :8.780
|
Max. :100.00
|
Max. :12.127
|
Max. :24.00
|
Max. :711.0
|
Max. :22.0
|
Max. :396.90
|
Max. :37.970
|
Max. :50.00
|
Max. :1.0000
|
sapply(train, function(x) sum(is.na(x)))
## zn indus chas nox rm age dis rad tax
## 0 0 0 0 0 0 0 0 0
## ptratio black lstat medv target
## 0 0 0 0 0
ntrain<-select_if(train, is.numeric)
ntrain %>%
keep(is.numeric) %>% # Keep only numeric columns
gather() %>% # Convert to key-value pairs
ggplot(aes(value)) + # Plot the values
facet_wrap(~ key, scales = "free") + # In separate panels
geom_density()

rcorr(as.matrix(train))
## zn indus chas nox rm age dis rad tax ptratio
## zn 1.00 -0.54 -0.04 -0.52 0.32 -0.57 0.66 -0.32 -0.32 -0.39
## indus -0.54 1.00 0.06 0.76 -0.39 0.64 -0.70 0.60 0.73 0.39
## chas -0.04 0.06 1.00 0.10 0.09 0.08 -0.10 -0.02 -0.05 -0.13
## nox -0.52 0.76 0.10 1.00 -0.30 0.74 -0.77 0.60 0.65 0.18
## rm 0.32 -0.39 0.09 -0.30 1.00 -0.23 0.20 -0.21 -0.30 -0.36
## age -0.57 0.64 0.08 0.74 -0.23 1.00 -0.75 0.46 0.51 0.26
## dis 0.66 -0.70 -0.10 -0.77 0.20 -0.75 1.00 -0.49 -0.53 -0.23
## rad -0.32 0.60 -0.02 0.60 -0.21 0.46 -0.49 1.00 0.91 0.47
## tax -0.32 0.73 -0.05 0.65 -0.30 0.51 -0.53 0.91 1.00 0.47
## ptratio -0.39 0.39 -0.13 0.18 -0.36 0.26 -0.23 0.47 0.47 1.00
## black 0.18 -0.36 0.04 -0.38 0.13 -0.27 0.29 -0.45 -0.44 -0.18
## lstat -0.43 0.61 -0.05 0.60 -0.63 0.61 -0.51 0.50 0.56 0.38
## medv 0.38 -0.50 0.16 -0.43 0.71 -0.38 0.26 -0.40 -0.49 -0.52
## target -0.43 0.60 0.08 0.73 -0.15 0.63 -0.62 0.63 0.61 0.25
## black lstat medv target
## zn 0.18 -0.43 0.38 -0.43
## indus -0.36 0.61 -0.50 0.60
## chas 0.04 -0.05 0.16 0.08
## nox -0.38 0.60 -0.43 0.73
## rm 0.13 -0.63 0.71 -0.15
## age -0.27 0.61 -0.38 0.63
## dis 0.29 -0.51 0.26 -0.62
## rad -0.45 0.50 -0.40 0.63
## tax -0.44 0.56 -0.49 0.61
## ptratio -0.18 0.38 -0.52 0.25
## black 1.00 -0.35 0.33 -0.35
## lstat -0.35 1.00 -0.74 0.47
## medv 0.33 -0.74 1.00 -0.27
## target -0.35 0.47 -0.27 1.00
##
## n= 466
##
##
## P
## zn indus chas nox rm age dis rad tax
## zn 0.0000 0.3870 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## indus 0.0000 0.1874 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## chas 0.3870 0.1874 0.0355 0.0509 0.0890 0.0372 0.7321 0.3138
## nox 0.0000 0.0000 0.0355 0.0000 0.0000 0.0000 0.0000 0.0000
## rm 0.0000 0.0000 0.0509 0.0000 0.0000 0.0000 0.0000 0.0000
## age 0.0000 0.0000 0.0890 0.0000 0.0000 0.0000 0.0000 0.0000
## dis 0.0000 0.0000 0.0372 0.0000 0.0000 0.0000 0.0000 0.0000
## rad 0.0000 0.0000 0.7321 0.0000 0.0000 0.0000 0.0000 0.0000
## tax 0.0000 0.0000 0.3138 0.0000 0.0000 0.0000 0.0000 0.0000
## ptratio 0.0000 0.0000 0.0054 0.0001 0.0000 0.0000 0.0000 0.0000 0.0000
## black 0.0000 0.0000 0.3384 0.0000 0.0041 0.0000 0.0000 0.0000 0.0000
## lstat 0.0000 0.0000 0.2679 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## medv 0.0000 0.0000 0.0005 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## target 0.0000 0.0000 0.0843 0.0000 0.0010 0.0000 0.0000 0.0000 0.0000
## ptratio black lstat medv target
## zn 0.0000 0.0000 0.0000 0.0000 0.0000
## indus 0.0000 0.0000 0.0000 0.0000 0.0000
## chas 0.0054 0.3384 0.2679 0.0005 0.0843
## nox 0.0001 0.0000 0.0000 0.0000 0.0000
## rm 0.0000 0.0041 0.0000 0.0000 0.0010
## age 0.0000 0.0000 0.0000 0.0000 0.0000
## dis 0.0000 0.0000 0.0000 0.0000 0.0000
## rad 0.0000 0.0000 0.0000 0.0000 0.0000
## tax 0.0000 0.0000 0.0000 0.0000 0.0000
## ptratio 0.0000 0.0000 0.0000 0.0000
## black 0.0000 0.0000 0.0000 0.0000
## lstat 0.0000 0.0000 0.0000 0.0000
## medv 0.0000 0.0000 0.0000 0.0000
## target 0.0000 0.0000 0.0000 0.0000
corrplot(cor(train), method="square")

# correlation test 1
cor.test(train$rad,train$tax,method="pearson")
##
## Pearson's product-moment correlation
##
## data: train$rad and train$tax
## t = 46.239, df = 464, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8888115 0.9214292
## sample estimates:
## cor
## 0.9064632
#significant ignore
Build Models
#MODEL 1
logit <- glm(target ~ ., data=train, family = "binomial" (link="logit"))
summary(logit)
##
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1097 -0.2702 -0.0188 0.1965 3.5752
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -33.029280 8.409645 -3.928 8.58e-05 ***
## zn -0.681913 0.248420 -2.745 0.006051 **
## indus -0.152009 0.049530 -3.069 0.002148 **
## chas 2.500407 0.970167 2.577 0.009958 **
## nox 51.588709 7.272073 7.094 1.30e-12 ***
## rm -0.093566 0.620763 -0.151 0.880190
## age 0.024907 0.012537 1.987 0.046958 *
## dis 0.922461 0.220580 4.182 2.89e-05 ***
## tax 0.008238 0.002404 3.427 0.000611 ***
## ptratio 0.298144 0.113540 2.626 0.008642 **
## black -1.785172 1.031215 -1.731 0.083428 .
## lstat 0.087574 0.052940 1.654 0.098088 .
## medv 0.181463 0.061169 2.967 0.003011 **
## new -0.425892 0.194726 -2.187 0.028732 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 223.59 on 452 degrees of freedom
## AIC: 251.59
##
## Number of Fisher Scoring iterations: 8
exp(logit$coefficients)
## (Intercept) zn indus chas nox
## 4.524451e-15 5.056487e-01 8.589803e-01 1.218746e+01 2.539168e+22
## rm age dis tax ptratio
## 9.106778e-01 1.025219e+00 2.515473e+00 1.008272e+00 1.347356e+00
## black lstat medv new
## 1.677682e-01 1.091523e+00 1.198970e+00 6.531872e-01
logitscalar <- mean(dlogis(predict(logit, type = "link")))
logitscalar * coef(logit)
## (Intercept) zn indus chas nox
## -2.5169305639 -0.0519638271 -0.0115835637 0.1905385638 3.9312148611
## rm age dis tax ptratio
## -0.0071300236 0.0018979633 0.0702942880 0.0006277545 0.0227194938
## black lstat medv new
## -0.1360354839 0.0066733801 0.0138280224 -0.0324542205
confint.default(logit)
## 2.5 % 97.5 %
## (Intercept) -4.951188e+01 -16.54667903
## zn -1.168807e+00 -0.19501926
## indus -2.490869e-01 -0.05493168
## chas 5.989156e-01 4.40189905
## nox 3.733571e+01 65.84171053
## rm -1.310238e+00 1.12310610
## age 3.349061e-04 0.04947844
## dis 4.901328e-01 1.35478874
## tax 3.526349e-03 0.01294950
## ptratio 7.560983e-02 0.52067879
## black -3.806316e+00 0.23597177
## lstat -1.618741e-02 0.19133483
## medv 6.157454e-02 0.30135134
## new -8.075467e-01 -0.04423648
predlogit <- predict(logit, type="response")
train2$pred1 <- predict(logit, type="response")
summary(predlogit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0001109 0.0309268 0.4822112 0.4914163 0.9714183 1.0000000
table(true = train$target, pred = round(fitted(logit)))
## pred
## true 0 1
## 0 213 24
## 1 22 207
#plots for Model 1
ggplot(data = train, aes(x = age, y = target)) +
geom_point(color = 'grey50') +
geom_point(data = train2, aes(x = age, y = pred1), color = 'red', size = 0.3) +
theme_bw()

data.frame(train2$pred1) %>%
ggplot(aes(x = train2.pred1)) +
geom_histogram(bins = 50, fill = 'grey50') +
labs(title = 'Histogram of Predictions') +
theme_bw()

plot.roc(train$target, train2$pred1)

#extract variables that are significant and rerun model
sigvars <- data.frame(summary(logit)$coef[summary(logit)$coef[,4] <= .05, 4])
sigvars <- add_rownames(sigvars, "vars")
## Warning: Deprecated, use tibble::rownames_to_column() instead.
colist<-dplyr::pull(sigvars, vars)
colist<-colist[2:11]
idx <- match(colist, names(train))
trainmod2 <- cbind(train[,idx], train2['target'])
#MODEL 2
logit2 <- glm(target ~ ., data=trainmod2, family = "binomial" (link="logit"))
summary(logit2)
##
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"),
## data = trainmod2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3341 -0.3000 -0.0271 0.2216 3.5120
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -40.559519 5.624242 -7.212 5.53e-13 ***
## zn -0.570579 0.227004 -2.514 0.011953 *
## indus -0.141446 0.049062 -2.883 0.003939 **
## chas 2.791745 0.970675 2.876 0.004026 **
## nox 48.903546 6.885627 7.102 1.23e-12 ***
## age 0.027388 0.010057 2.723 0.006465 **
## dis 0.804755 0.203539 3.954 7.69e-05 ***
## tax 0.008311 0.002432 3.418 0.000632 ***
## ptratio 0.289929 0.107363 2.700 0.006925 **
## medv 0.130716 0.033228 3.934 8.36e-05 ***
## new -0.280439 0.208775 -1.343 0.179187
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 234.26 on 455 degrees of freedom
## AIC: 256.26
##
## Number of Fisher Scoring iterations: 7
exp(logit2$coefficients)
## (Intercept) zn indus chas nox
## 2.427866e-18 5.651983e-01 8.681022e-01 1.630945e+01 1.731970e+21
## age dis tax ptratio medv
## 1.027766e+00 2.236148e+00 1.008346e+00 1.336332e+00 1.139644e+00
## new
## 7.554517e-01
logit2scalar <- mean(dlogis(predict(logit2, type = "link")))
logit2scalar * coef(logit2)
## (Intercept) zn indus chas nox
## -3.2356595453 -0.0455182453 -0.0112839288 0.2227131005 3.9013092282
## age dis tax ptratio medv
## 0.0021848724 0.0641997863 0.0006630314 0.0231292172 0.0104279150
## new
## -0.0223722213
predlogit2 <- predict(logit2, type="response")
train2$pred2 <- predict(logit2, type="response")
summary(predlogit2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0002271 0.0330183 0.4686400 0.4914163 0.9662585 0.9999995
table(true = train$target, pred = round(fitted(logit2)))
## pred
## true 0 1
## 0 214 23
## 1 25 204
#plots for Model 2
ggplot(data = train, aes(x = age, y = target)) +
geom_point(color = 'grey50') +
geom_point(data = train2, aes(x = age, y = pred2), color = 'red', size = 0.3) +
theme_bw()

data.frame(train2$pred2) %>%
ggplot(aes(x = train2.pred2)) +
geom_histogram(bins = 50, fill = 'grey50') +
labs(title = 'Histogram of Predictions') +
theme_bw()

plot.roc(train$target, train2$pred2)

#MODEL 3
#PC Model no racial bias
logit3<-model <- glm(target ~ zn + indus + nox + rm + age + dis + tax + ptratio + medv + new, data=train, family = "binomial" (link="logit"))
summary(logit3)
##
## Call:
## glm(formula = target ~ zn + indus + nox + rm + age + dis + tax +
## ptratio + medv + new, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3623 -0.3016 -0.0318 0.2470 3.4208
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -35.640886 5.181413 -6.879 6.04e-12 ***
## zn -0.607165 0.230874 -2.630 0.008542 **
## indus -0.122350 0.045692 -2.678 0.007413 **
## nox 44.778082 6.398918 6.998 2.60e-12 ***
## rm -0.332101 0.508476 -0.653 0.513673
## age 0.030652 0.010570 2.900 0.003732 **
## dis 0.771242 0.203098 3.797 0.000146 ***
## tax 0.007710 0.002318 3.326 0.000880 ***
## ptratio 0.231692 0.105821 2.189 0.028563 *
## medv 0.151192 0.054648 2.767 0.005663 **
## new -0.250426 0.197261 -1.270 0.204257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 242.17 on 455 degrees of freedom
## AIC: 264.17
##
## Number of Fisher Scoring iterations: 7
exp(logit3$coefficients)
## (Intercept) zn indus nox rm
## 3.321695e-16 5.448937e-01 8.848385e-01 2.798168e+19 7.174150e-01
## age dis tax ptratio medv
## 1.031126e+00 2.162451e+00 1.007740e+00 1.260731e+00 1.163220e+00
## new
## 7.784692e-01
predlogit3 <- predict(logit3, type="response")
train2$pred3 <- predict(logit3, type="response")
summary(predlogit3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0003552 0.0393321 0.4672500 0.4914163 0.9610074 0.9999973
table(true = train$target, pred = round(fitted(logit3)))
## pred
## true 0 1
## 0 210 27
## 1 26 203
#plots for Model 2
ggplot(data = train, aes(x = age, y = target)) +
geom_point(color = 'grey50') +
geom_point(data = train2, aes(x = age, y = pred3), color = 'red', size = 0.3) +
theme_bw()

data.frame(train2$pred3) %>%
ggplot(aes(x = train2.pred3)) +
geom_histogram(bins = 50, fill = 'grey50') +
labs(title = 'Histogram of Predictions') +
theme_bw()

plot.roc(train$target, train2$pred3)

logit3scalar <- mean(dlogis(predict(logit3, type = "link")))
logit3scalar * coef(logit3)
## (Intercept) zn indus nox rm
## -2.9404579504 -0.0500925172 -0.0100941744 3.6942983150 -0.0273991093
## age dis tax ptratio medv
## 0.0025288302 0.0636293182 0.0006361103 0.0191151040 0.0124736734
## new
## -0.0206607316
round(logitscalar * coef(logit),2)
## (Intercept) zn indus chas nox rm
## -2.52 -0.05 -0.01 0.19 3.93 -0.01
## age dis tax ptratio black lstat
## 0.00 0.07 0.00 0.02 -0.14 0.01
## medv new
## 0.01 -0.03
round(logit2scalar * coef(logit2),2)
## (Intercept) zn indus chas nox age
## -3.24 -0.05 -0.01 0.22 3.90 0.00
## dis tax ptratio medv new
## 0.06 0.00 0.02 0.01 -0.02
round(logit3scalar * coef(logit3),2)
## (Intercept) zn indus nox rm age
## -2.94 -0.05 -0.01 3.69 -0.03 0.00
## dis tax ptratio medv new
## 0.06 0.00 0.02 0.01 -0.02