library(forecast)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(e1071)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
library(MASS)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 1.4.2 ✔ purrr 0.2.5
## ✔ tidyr 0.8.1 ✔ dplyr 0.7.6
## ✔ readr 1.1.1 ✔ stringr 1.3.1
## ✔ tibble 1.4.2 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ cowplot::ggsave() masks ggplot2::ggsave()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::margin() masks randomForest::margin()
## ✖ dplyr::select() masks MASS::select()
library(broom)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# Load the training and evaluation data
train_df <- as.data.frame(read.csv("https://raw.githubusercontent.com/simplymathematics/621/master/HW3/crime-training-data_modified.csv"))
test_df <- as.data.frame(read.csv("https://raw.githubusercontent.com/simplymathematics/621/master/HW3/crime-evaluation-data_modified.csv"))
str(train_df)
## 'data.frame': 466 obs. of 13 variables:
## $ zn : num 0 0 0 30 0 0 0 0 0 80 ...
## $ indus : num 19.58 19.58 18.1 4.93 2.46 ...
## $ chas : int 0 1 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.605 0.871 0.74 0.428 0.488 0.52 0.693 0.693 0.515 0.392 ...
## $ rm : num 7.93 5.4 6.49 6.39 7.16 ...
## $ age : num 96.2 100 100 7.8 92.2 71.3 100 100 38.1 19.1 ...
## $ dis : num 2.05 1.32 1.98 7.04 2.7 ...
## $ rad : int 5 5 24 6 3 5 24 24 5 1 ...
## $ tax : int 403 403 666 300 193 384 666 666 224 315 ...
## $ ptratio: num 14.7 14.7 20.2 16.6 17.8 20.9 20.2 20.2 20.2 16.4 ...
## $ lstat : num 3.7 26.82 18.85 5.19 4.82 ...
## $ medv : num 50 13.4 15.4 23.7 37.9 26.5 5 7 22.2 20.9 ...
## $ target : int 1 1 1 0 0 0 1 1 0 0 ...
# Treat the response and the Charles River dummy variable as factors
train_df$target <- as.factor(train_df$target)
train_df$chas <- as.factor(train_df$chas)
colnames(train_df)
## [1] "zn" "indus" "chas" "nox" "rm" "age" "dis"
## [8] "rad" "tax" "ptratio" "lstat" "medv" "target"
Splitting the data set into training and test sets.
set.seed(123)
training.samples <- train_df$target %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- train_df[training.samples, ]
test.data <- train_df[-training.samples, ]
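Because createDataPartition() samples within each level of target, both splits should keep roughly the same class balance; a quick optional check (a sketch, not part of the original output):
# Compare class proportions in the two splits (illustrative sanity check)
prop.table(table(train.data$target))
prop.table(table(test.data$target))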
A helper function that computes a confusion matrix on the hold-out test set (note: the name masks base::matrix() for the rest of the session).
matrix <- function(model_name){
  # Predicted probabilities on test.data, classified with a 0.5 cutoff
  probabilities <- model_name %>% predict(test.data, type = "response")
  predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
  confusionMatrix(as.factor(predicted.classes), as.factor(test.data$target), dnn = c("Prediction", "Reference"))
}
Basic model: backward stepwise selection on a logistic regression with all predictors.
model_1<- step(glm(target~., data = train_df, family = 'binomial'), direction = 'backward')
## Start: AIC=218.05
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv
##
## Df Deviance AIC
## - rm 1 192.71 216.71
## - lstat 1 192.77 216.77
## - chas 1 193.53 217.53
## - indus 1 193.99 217.99
## <none> 192.05 218.05
## - tax 1 196.59 220.59
## - zn 1 196.89 220.89
## - age 1 198.73 222.73
## - medv 1 199.95 223.95
## - ptratio 1 203.32 227.32
## - dis 1 203.84 227.84
## - rad 1 233.74 257.74
## - nox 1 265.05 289.05
##
## Step: AIC=216.71
## target ~ zn + indus + chas + nox + age + dis + rad + tax + ptratio +
## lstat + medv
##
## Df Deviance AIC
## - chas 1 194.24 216.24
## - lstat 1 194.32 216.32
## - indus 1 194.58 216.58
## <none> 192.71 216.71
## - tax 1 197.59 219.59
## - zn 1 198.07 220.07
## - age 1 199.11 221.11
## - ptratio 1 203.53 225.53
## - dis 1 203.85 225.85
## - medv 1 205.35 227.35
## - rad 1 233.81 255.81
## - nox 1 265.14 287.14
##
## Step: AIC=216.24
## target ~ zn + indus + nox + age + dis + rad + tax + ptratio +
## lstat + medv
##
## Df Deviance AIC
## - indus 1 195.51 215.51
## <none> 194.24 216.24
## - lstat 1 196.33 216.33
## - zn 1 200.59 220.59
## - tax 1 200.75 220.75
## - age 1 201.00 221.00
## - ptratio 1 203.94 223.94
## - dis 1 204.83 224.83
## - medv 1 207.12 227.12
## - rad 1 241.41 261.41
## - nox 1 265.19 285.19
##
## Step: AIC=215.51
## target ~ zn + nox + age + dis + rad + tax + ptratio + lstat +
## medv
##
## Df Deviance AIC
## - lstat 1 197.32 215.32
## <none> 195.51 215.51
## - zn 1 202.05 220.05
## - age 1 202.23 220.23
## - ptratio 1 205.01 223.01
## - dis 1 205.96 223.96
## - tax 1 206.60 224.60
## - medv 1 208.13 226.13
## - rad 1 249.55 267.55
## - nox 1 270.59 288.59
##
## Step: AIC=215.32
## target ~ zn + nox + age + dis + rad + tax + ptratio + medv
##
## Df Deviance AIC
## <none> 197.32 215.32
## - zn 1 203.45 219.45
## - ptratio 1 206.27 222.27
## - age 1 207.13 223.13
## - tax 1 207.62 223.62
## - dis 1 207.64 223.64
## - medv 1 208.65 224.65
## - rad 1 250.98 266.98
## - nox 1 273.18 289.18
summary(model_1)
##
## Call:
## glm(formula = target ~ zn + nox + age + dis + rad + tax + ptratio +
## medv, family = "binomial", data = train_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8295 -0.1752 -0.0021 0.0032 3.4191
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -37.415922 6.035013 -6.200 5.65e-10 ***
## zn -0.068648 0.032019 -2.144 0.03203 *
## nox 42.807768 6.678692 6.410 1.46e-10 ***
## age 0.032950 0.010951 3.009 0.00262 **
## dis 0.654896 0.214050 3.060 0.00222 **
## rad 0.725109 0.149788 4.841 1.29e-06 ***
## tax -0.007756 0.002653 -2.924 0.00346 **
## ptratio 0.323628 0.111390 2.905 0.00367 **
## medv 0.110472 0.035445 3.117 0.00183 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 197.32 on 457 degrees of freedom
## AIC: 215.32
##
## Number of Fisher Scoring iterations: 9
matrix(model_1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 41 4
## 1 6 41
##
## Accuracy : 0.8913
## 95% CI : (0.8092, 0.9466)
## No Information Rate : 0.5109
## P-Value [Acc > NIR] : 7.782e-15
##
## Kappa : 0.7827
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.8723
## Specificity : 0.9111
## Pos Pred Value : 0.9111
## Neg Pred Value : 0.8723
## Prevalence : 0.5109
## Detection Rate : 0.4457
## Detection Prevalence : 0.4891
## Balanced Accuracy : 0.8917
##
## 'Positive' Class : 0
##
pR2(model_1)
## llh llhNull G2 McFadden r2ML
## -98.6614264 -322.9379132 448.5529737 0.6944879 0.6180861
## r2CU
## 0.8241958
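The McFadden value reported above equals 1 - llh/llhNull; a minimal sketch reproducing it directly from the fitted model (null_model is a hypothetical intercept-only refit, not part of the original code):
# McFadden's pseudo-R^2 = 1 - logLik(full model) / logLik(intercept-only model)
null_model <- glm(target ~ 1, data = train_df, family = 'binomial')
1 - as.numeric(logLik(model_1)) / as.numeric(logLik(null_model))
# 1 - (-98.66)/(-322.94) is roughly 0.694, matching the pR2() output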
Check model_1 against the following logistic regression assumptions:
Checking for a linear relationship between the logit of the outcome and each numeric predictor.
probabilities <- predict(model_1, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
# Select only numeric predictors
mydata <- train_df %>%
  dplyr::select_if(is.numeric)
predictors <- colnames(mydata)
# Bind the logit and tidy the data for plotting
mydata <- mydata %>%
  mutate(logit = log(probabilities/(1 - probabilities))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)
ggplot(mydata, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")
The relationships are not linear; the model may benefit from variable transformations.
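One possible remedy, shown only as a hedged sketch (the choice of predictors and of the log transform is an assumption, not one of the models fitted below), is to transform strictly positive, right-skewed predictors before refitting:
# Illustrative only: log-transform nox and dis, keep the remaining terms unchanged
model_1_log <- glm(target ~ zn + log(nox) + age + log(dis) + rad + tax + ptratio + medv,
                   data = train_df, family = 'binomial')
summary(model_1_log)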
Checking model_1 for the presence of influential (high-leverage) values.
plot(model_1, which = 4, id.n = 5)
# Extract model results
model.data <- augment(model_1) %>%
  mutate(index = 1:n())
model.data %>% top_n(5, .cooksd)
## # A tibble: 5 x 17
## target zn nox age dis rad tax ptratio medv .fitted .se.fit
## <fct> <dbl> <dbl> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 22 0.431 8.4 8.91 7 330 19.1 42.8 -0.941 0.970
## 2 1 0 0.544 37.8 2.52 4 304 18.4 16.1 -2.96 0.706
## 3 1 22 0.431 34.9 8.06 7 330 19.1 24.3 -2.67 0.652
## 4 1 20 0.464 42.1 4.43 3 223 18.6 21.1 -5.84 0.936
## 5 1 0 0.489 9.8 3.59 4 277 18.6 23.7 -4.42 0.832
## # ... with 6 more variables: .resid <dbl>, .hat <dbl>, .sigma <dbl>,
## # .cooksd <dbl>, .std.resid <dbl>, index <int>
ggplot(model.data, aes(index, .std.resid)) +
  geom_point(aes(color = target), alpha = .5) +
  theme_bw()
model.data %>%
  filter(abs(.std.resid) > 3)
## # A tibble: 1 x 17
## target zn nox age dis rad tax ptratio medv .fitted .se.fit
## <fct> <dbl> <dbl> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 20 0.464 42.1 4.43 3 223 18.6 21.1 -5.84 0.936
## # ... with 6 more variables: .resid <dbl>, .hat <dbl>, .sigma <dbl>,
## # .cooksd <dbl>, .std.resid <dbl>, index <int>
Eliminating the row with the influential value.
train_df_clean <- train_df %>%
  filter(!(nox == 0.464 & age == 42.1))
Checking for multicollinearity among the predictors.
vif(model_1)
## zn nox age dis rad tax ptratio medv
## 1.789037 3.172660 1.701974 3.595939 1.697110 1.754274 1.865085 2.193689
No significant multicollinearity is detected in model_1; all VIF values are below 4.
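A common rule of thumb flags predictors with a VIF above 5 (or 10); a minimal check (a sketch, not part of the original analysis):
# Flag any predictor whose VIF exceeds 5 (none do for model_1)
vif(model_1)[vif(model_1) > 5]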
Building a model on the dataset with the influential observation removed.
model_2<- step(glm(target~., data = train_df_clean, family = 'binomial'), direction = 'backward')
## Start: AIC=204.95
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv
##
## Df Deviance AIC
## - lstat 1 179.53 203.53
## - rm 1 179.86 203.86
## - chas 1 180.40 204.40
## <none> 178.95 204.95
## - indus 1 181.26 205.26
## - tax 1 182.93 206.93
## - zn 1 186.28 210.28
## - age 1 187.56 211.56
## - medv 1 188.43 212.43
## - ptratio 1 190.95 214.95
## - dis 1 194.36 218.36
## - rad 1 221.84 245.84
## - nox 1 258.08 282.08
##
## Step: AIC=203.53
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + medv
##
## Df Deviance AIC
## - chas 1 181.28 203.28
## - rm 1 181.38 203.38
## <none> 179.53 203.53
## - indus 1 181.76 203.76
## - tax 1 183.26 205.26
## - zn 1 186.46 208.46
## - medv 1 189.16 211.16
## - ptratio 1 192.50 214.50
## - age 1 192.97 214.97
## - dis 1 195.50 217.50
## - rad 1 222.50 244.50
## - nox 1 259.97 281.97
##
## Step: AIC=203.28
## target ~ zn + indus + nox + rm + age + dis + rad + tax + ptratio +
## medv
##
## Df Deviance AIC
## - indus 1 182.79 202.79
## <none> 181.28 203.28
## - rm 1 183.38 203.38
## - tax 1 186.60 206.60
## - zn 1 189.44 209.44
## - medv 1 191.08 211.08
## - ptratio 1 193.09 213.09
## - age 1 195.88 215.88
## - dis 1 196.73 216.73
## - rad 1 232.03 252.03
## - nox 1 260.00 280.00
##
## Step: AIC=202.79
## target ~ zn + nox + rm + age + dis + rad + tax + ptratio + medv
##
## Df Deviance AIC
## - rm 1 184.66 202.66
## <none> 182.79 202.79
## - zn 1 191.25 209.25
## - medv 1 192.17 210.17
## - tax 1 192.67 210.67
## - ptratio 1 194.12 212.12
## - age 1 196.72 214.72
## - dis 1 197.96 215.96
## - rad 1 240.79 258.79
## - nox 1 266.03 284.03
##
## Step: AIC=202.66
## target ~ zn + nox + age + dis + rad + tax + ptratio + medv
##
## Df Deviance AIC
## <none> 184.66 202.66
## - zn 1 193.60 209.60
## - ptratio 1 194.15 210.15
## - tax 1 194.85 210.85
## - age 1 196.83 212.83
## - dis 1 198.25 214.25
## - medv 1 198.43 214.43
## - rad 1 240.96 256.96
## - nox 1 266.17 282.17
summary(model_2)
##
## Call:
## glm(formula = target ~ zn + nox + age + dis + rad + tax + ptratio +
## medv, family = "binomial", data = train_df_clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8555 -0.1501 -0.0006 0.0014 3.1726
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.128250 6.730139 -6.260 3.86e-10 ***
## zn -0.093902 0.036896 -2.545 0.010927 *
## nox 47.388109 7.340291 6.456 1.08e-10 ***
## age 0.038718 0.011711 3.306 0.000946 ***
## dis 0.807838 0.235713 3.427 0.000610 ***
## rad 0.806479 0.161555 4.992 5.98e-07 ***
## tax -0.007945 0.002739 -2.900 0.003728 **
## ptratio 0.350885 0.117755 2.980 0.002884 **
## medv 0.130814 0.038521 3.396 0.000684 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 644.45 on 464 degrees of freedom
## Residual deviance: 184.66 on 456 degrees of freedom
## AIC: 202.66
##
## Number of Fisher Scoring iterations: 9
vif(model_2)
## zn nox age dis rad tax ptratio medv
## 1.960798 3.435929 1.773456 4.001177 1.770018 1.759468 1.985662 2.336037
matrix(model_2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 41 5
## 1 6 40
##
## Accuracy : 0.8804
## 95% CI : (0.7961, 0.9388)
## No Information Rate : 0.5109
## P-Value [Acc > NIR] : 5.644e-14
##
## Kappa : 0.7609
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8723
## Specificity : 0.8889
## Pos Pred Value : 0.8913
## Neg Pred Value : 0.8696
## Prevalence : 0.5109
## Detection Rate : 0.4457
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.8806
##
## 'Positive' Class : 0
##
pR2(model_2)
## llh llhNull G2 McFadden r2ML
## -92.3291377 -322.2263367 459.7943981 0.7134650 0.6279791
## r2CU
## 0.8374100
Applying Box-Cox variable transformations.
model_3 <- train(target ~ ., data = train_df_clean,
                 method = "glm", family = "binomial",
                 preProcess = c("BoxCox"),
                 trControl = trainControl(method = "cv", number = 10,
                                          savePredictions = TRUE))
# Variables to which the Box-Cox transformation was applied
process <- preProcess(select(train_df_clean, -target),
                      method = c("BoxCox"))
process$method
## $BoxCox
## [1] "indus" "nox" "rm" "age" "dis" "rad" "tax"
## [8] "ptratio" "lstat" "medv"
##
## $ignore
## [1] "chas"
summary(model_3)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.99963 -0.08835 -0.00040 0.09534 3.14474
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 15.972742 41.521252 0.385 0.700468
## zn -0.033415 0.030692 -1.089 0.276283
## indus -0.036839 0.232913 -0.158 0.874324
## chas1 1.029523 0.808347 1.274 0.202800
## nox 15.506450 2.431033 6.379 1.79e-10 ***
## rm -2.733095 2.958518 -0.924 0.355587
## age 0.013945 0.004176 3.339 0.000841 ***
## dis 3.841600 0.928034 4.140 3.48e-05 ***
## rad 3.400070 0.806710 4.215 2.50e-05 ***
## tax -13.823020 22.144986 -0.624 0.532493
## ptratio 0.027592 0.007587 3.637 0.000276 ***
## lstat -0.104096 0.464218 -0.224 0.822570
## medv 2.763687 0.914536 3.022 0.002511 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 644.45 on 464 degrees of freedom
## Residual deviance: 184.24 on 452 degrees of freedom
## AIC: 210.24
##
## Number of Fisher Scoring iterations: 8
vif(model_3$finalModel)
## zn indus chas1 nox rm age dis rad
## 1.498521 2.989495 1.321736 4.017031 4.581452 2.700827 3.832244 2.253491
## tax ptratio lstat medv
## 2.955169 2.660317 3.740465 8.062733
probabilities <- model_3 %>% predict(test.data, type = "prob")
predicted.classes <- ifelse((probabilities[,2]) > 0.5, 1, 0)
confusionMatrix(as.factor(predicted.classes), as.factor(test.data$target), dnn = c("Prediction", "Reference"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 41 4
## 1 6 41
##
## Accuracy : 0.8913
## 95% CI : (0.8092, 0.9466)
## No Information Rate : 0.5109
## P-Value [Acc > NIR] : 7.782e-15
##
## Kappa : 0.7827
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.8723
## Specificity : 0.9111
## Pos Pred Value : 0.9111
## Neg Pred Value : 0.8723
## Prevalence : 0.5109
## Detection Rate : 0.4457
## Detection Prevalence : 0.4891
## Balanced Accuracy : 0.8917
##
## 'Positive' Class : 0
##
A model based on important variables.
Selecting important variables using the caret package.
# prepare training scheme
control <- trainControl(method="repeatedcv", number=10, repeats=3)
# train the model
model <- train(target~., data=train_df, method="glm", trControl=control)
# estimate variable importance
importance <- varImp(model, scale=FALSE)
# summarize importance
print(importance)
## glm variable importance
##
## Overall
## nox 6.1932
## rad 4.0843
## dis 3.2077
## ptratio 3.1791
## medv 2.6477
## age 2.4749
## tax 2.0887
## zn 1.9029
## indus 1.3568
## chas1 1.2054
## lstat 0.8486
## rm 0.8127
# plot importance
plot(importance)
model_4<- glm(target~., data = train_df %>% select(-lstat, -rm), family = 'binomial')
summary(model_4)
##
## Call:
## glm(formula = target ~ ., family = "binomial", data = train_df %>%
## select(-lstat, -rm))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8087 -0.1791 -0.0025 0.0041 3.4569
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -40.451410 6.412534 -6.308 2.82e-10 ***
## zn -0.063334 0.032883 -1.926 0.05410 .
## indus -0.059204 0.047258 -1.253 0.21028
## chas1 1.089782 0.773974 1.408 0.15912
## nox 47.790906 7.704999 6.203 5.55e-10 ***
## age 0.032699 0.011119 2.941 0.00327 **
## dis 0.691906 0.218023 3.174 0.00151 **
## rad 0.631833 0.155927 4.052 5.08e-05 ***
## tax -0.006063 0.002927 -2.072 0.03831 *
## ptratio 0.356879 0.113638 3.140 0.00169 **
## medv 0.111851 0.035310 3.168 0.00154 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 194.32 on 455 degrees of freedom
## AIC: 216.32
##
## Number of Fisher Scoring iterations: 9
vif(model_4)
## zn indus chas nox age dis rad tax
## 1.740018 2.636065 1.188059 3.995876 1.711978 3.537228 1.815751 2.082398
## ptratio medv
## 1.863116 2.219497
matrix(model_4)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 43 4
## 1 4 41
##
## Accuracy : 0.913
## 95% CI : (0.8358, 0.9617)
## No Information Rate : 0.5109
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.826
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9149
## Specificity : 0.9111
## Pos Pred Value : 0.9149
## Neg Pred Value : 0.9111
## Prevalence : 0.5109
## Detection Rate : 0.4674
## Detection Prevalence : 0.5109
## Balanced Accuracy : 0.9130
##
## 'Positive' Class : 0
##
pR2(model_4)
## llh llhNull G2 McFadden r2ML
## -97.1623981 -322.9379132 451.5510303 0.6991298 0.6205353
## r2CU
## 0.8274617
Selecting the model with the lowest Akaike information criterion (AIC); the stepAIC() function from the MASS package is used.
# Fit the model
model_5 <- glm(target ~ ., data = train.data, family = binomial) %>%
  stepAIC(trace = FALSE)
# Summarize the final selected model
summary(model_5)
##
## Call:
## glm(formula = target ~ zn + nox + age + dis + rad + tax + ptratio +
## medv, family = binomial, data = train.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8476 -0.1617 -0.0019 0.0047 3.3501
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -35.267187 6.652404 -5.301 1.15e-07 ***
## zn -0.070513 0.036909 -1.910 0.05607 .
## nox 42.743396 7.365107 5.803 6.49e-09 ***
## age 0.031482 0.012618 2.495 0.01260 *
## dis 0.607943 0.245430 2.477 0.01325 *
## rad 0.660875 0.163688 4.037 5.40e-05 ***
## tax -0.008578 0.003269 -2.624 0.00869 **
## ptratio 0.273859 0.122210 2.241 0.02503 *
## medv 0.097398 0.039693 2.454 0.01414 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 518.38 on 373 degrees of freedom
## Residual deviance: 157.61 on 365 degrees of freedom
## AIC: 175.61
##
## Number of Fisher Scoring iterations: 9
matrix(model_5)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 41 4
## 1 6 41
##
## Accuracy : 0.8913
## 95% CI : (0.8092, 0.9466)
## No Information Rate : 0.5109
## P-Value [Acc > NIR] : 7.782e-15
##
## Kappa : 0.7827
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.8723
## Specificity : 0.9111
## Pos Pred Value : 0.9111
## Neg Pred Value : 0.8723
## Prevalence : 0.5109
## Detection Rate : 0.4457
## Detection Prevalence : 0.4891
## Balanced Accuracy : 0.8917
##
## 'Positive' Class : 0
##
pR2(model_5)
## llh llhNull G2 McFadden r2ML
## -78.8030913 -259.1889151 360.7716477 0.6959627 0.6188758
## r2CU
## 0.8252386
Using an SVM with a nonlinear (radial basis) kernel for comparison with logistic regression.
model_6 <- train(target ~ ., data = train_df,
                 method = "svmRadial",
                 # preProcess = c("center", "scale"),
                 trControl = trainControl(method = "cv", number = 10, savePredictions = TRUE))
predicted.classes <- model_6 %>% predict(test.data)
mean(predicted.classes == test.data$target)
## [1] 0.9456522
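For a like-for-like comparison with the logistic regression models above, the same confusion-matrix summary can be applied to the SVM predictions (a sketch following the helper's conventions, not part of the original output):
# Confusion matrix for the SVM on the hold-out set
confusionMatrix(as.factor(predicted.classes), as.factor(test.data$target),
                dnn = c("Prediction", "Reference"))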