Homework 3

Overview

In this homework assignment, you will explore, analyze and model a data set containing information on crime for various neighborhoods of a major city. Each record has a response variable indicating whether or not the crime rate is above the median crime rate (1) or not(0).

Your objective is to build a binary logistic regression model on the training data set to predict whether the neighborhood will be at risk for high crime levels. You will provide classifications and probabilities for the evaluation data set using your binary logistic regression model. You can only use the variables given to you (or variables that you derive from the variables provided). Below is a short description of the variables of interest in the data set:

zn: proportion of residential land zoned for large lots (over 25000 square feet) (predictor variable)
indus: proportion of non-retail business acres per suburb (predictor variable)
chas: a dummy var. for whether the suburb borders the Charles River (1) or not (0) (predictor variable)
nox: nitrogen oxides concentration (parts per 10 million) (predictor variable)
rm: average number of rooms per dwelling (predictor variable)
age: proportion of owner-occupied units built prior to 1940 (predictor variable)
dis: weighted mean of distances to five Boston employment centers (predictor variable)
rad: index of accessibility to radial highways (predictor variable)
tax: full-value property-tax rate per $10,000 (predictor variable)
ptratio: pupil-teacher ratio by town (predictor variable)
black: 1000(Bk - 0.63)2 where Bk is the proportion of blacks by town (predictor variable)
lstat: lower status of the population (percent) (predictor variable)
medv: median value of owner-occupied homes in $1000s (predictor variable)
target: whether the crime rate is above the median crime rate (1) or not (0) (response variable)

Data Exploration

training <- read.csv('./crime-training-data_modified.csv')
training2 <- training # for melting and box plot
evaluation <- read.csv('./crime-evaluation-data_modified.csv')

training %>% head() %>% kable()

zn	indus	chas	nox	rm	age	dis	rad	tax	ptratio	lstat	medv	target
0	19.58	0	0.605	7.929	96.2	2.0459	5	403	14.7	3.70	50.0	1
0	19.58	1	0.871	5.403	100.0	1.3216	5	403	14.7	26.82	13.4	1
0	18.10	0	0.740	6.485	100.0	1.9784	24	666	20.2	18.85	15.4	1
30	4.93	0	0.428	6.393	7.8	7.0355	6	300	16.6	5.19	23.7	0
0	2.46	0	0.488	7.155	92.2	2.7006	3	193	17.8	4.82	37.9	0
0	8.56	0	0.520	6.781	71.3	2.8561	5	384	20.9	7.67	26.5	0

# Converting to factor
var <- c("chas","target")
training[,var] <- lapply(training[,var], as.factor)
evaluation$chas <- as.factor(evaluation$chas)

Checking missing data

missmap(training, main="Missing Values") # using Amelia package

colSums(is.na(training))

##      zn   indus    chas     nox      rm     age     dis     rad     tax ptratio 
##       0       0       0       0       0       0       0       0       0       0 
##   lstat    medv  target 
##       0       0       0

# Boxplot to see distributions with target variable
melt(training2, id.vars='target') %>% mutate(target = as.factor(target)) %>% 
  ggplot(., aes(x=variable, y=value))+geom_boxplot(aes(fill=target))+facet_wrap(~variable, dir='h',scales='free')+ labs(title="BoxPlot - Predictors Data Distribution with Target Variable")

# Correlation matrix among variables
training2 %>% 
  cor(., use = "complete.obs") %>%
  corrplot(., method = "color", type = "upper", tl.col = "black", tl.cex=.8, diag = FALSE)

# Correlation table 
correlation <- training2 %>% 
  cor(., use = "complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column()%>%
  gather(Variable, Correlation, -rowname) 

correlation %>%
  filter(Variable == "target") %>%
     arrange(desc(Correlation)) %>%
  kable()

rowname	Variable	Correlation
target	target	1.0000000
nox	target	0.7261062
age	target	0.6301062
rad	target	0.6281049
tax	target	0.6111133
indus	target	0.6048507
lstat	target	0.4691270
ptratio	target	0.2508489
chas	target	0.0800419
rm	target	-0.1525533
medv	target	-0.2705507
zn	target	-0.4316818
dis	target	-0.6186731

# Density plot to check normality
melt(training2, id.vars='target') %>% mutate(target = as.factor(target)) %>% 
  ggplot(., aes(x=value))+geom_density(fill='gray')+facet_wrap(~variable, scales='free')+
  labs(title="Density Plot for Normality and Skewness") + 
  theme_classic()

# Skewness and outliers
sapply(training2, skewness, function(x) skewness(x))

##          zn       indus        chas         nox          rm         age 
##  2.17681518  0.28854503  3.33548988  0.74632807  0.47932023 -0.57770755 
##         dis         rad         tax     ptratio       lstat        medv 
##  0.99889262  1.01027875  0.65931363 -0.75426808  0.90558642  1.07669198 
##      target 
##  0.03422935

Data Preparation

Data Splitting

# Data splitting into train and test datasets out of training2
set.seed(1003)
training_partition <- createDataPartition(training2$target, p=0.7, list = FALSE, times=1)
train2 <- training2[training_partition, ]
test2 <- training2[-training_partition, ]

sapply(training2, skewness, function(x) skewness(x))

##          zn       indus        chas         nox          rm         age 
##  2.17681518  0.28854503  3.33548988  0.74632807  0.47932023 -0.57770755 
##         dis         rad         tax     ptratio       lstat        medv 
##  0.99889262  1.01027875  0.65931363 -0.75426808  0.90558642  1.07669198 
##      target 
##  0.03422935

log transformation

train_log <- train2 # copy of basic model for log transformation
test_log <- test2


train_log$zn <- log10(train_log$zn + 1)
test_log$zn <- log10(test_log$zn + 1)

# Plot and check skewness
sapply(train_log, skewness, function(x) skewness(x))

##         zn      indus       chas        nox         rm        age        dis 
##  1.1954317  0.3708873  3.2567321  0.8108244  0.4843153 -0.5322325  0.9721716 
##        rad        tax    ptratio      lstat       medv     target 
##  1.1317640  0.7385414 -0.8218609  0.9700911  0.9700927  0.1036391

ggplot(melt(train_log), aes(x=value))+geom_density()+facet_wrap(~variable, scales='free') + labs(title="Log Transformation")

BoxCox Transformation

# Copy of train and test
train_boxcox <- train2
test_boxcox <- test2

# Preprocessing
preproc_value <- preProcess(train2[,-1] , c("BoxCox", "center", "scale"))

# Transformation on both train and test datasets
train_boxcox_transformed <- predict(preproc_value, train_boxcox)
test_boxcox_transformed <- predict(preproc_value, test_boxcox)

ggplot(melt(train_boxcox_transformed), aes(x=value))+geom_density()+facet_wrap(~variable, scales='free') + labs(title="BoxCox Transformation")

sapply(train_boxcox_transformed, function(x) skewness(x))

##           zn        indus         chas          nox           rm          age 
##  2.353729370 -0.120195838  3.256732134  0.068418489  0.026658478 -0.366340589 
##          dis          rad          tax      ptratio        lstat         medv 
##  0.106811243  0.402503618  0.069764777 -0.613098264 -0.002592159 -0.039697233 
##       target 
##  0.103639097

Build Models

Model 1 - Glmulti

Model by Forhad Akbar

# Copying test and train subset to unique variable name 
test_M1 <- test2
train_M1 <- train2

# Model1 using glmulti()
model1 <- glmulti(target ~ ., data = train_M1, level = 1, method="h", crit = "aic", plotty = FALSE, fitfunction = "glm", family=binomial)

## Initialization...
## TASK: Exhaustive screening of candidate set.
## Fitting...
## 
## After 50 models:
## Best model: target~1+zn+indus+nox+rm
## Crit= 222.104661461643
## Mean crit= 291.521601208484
## 
## After 100 models:
## Best model: target~1+zn+indus+chas+nox+rm+dis
## Crit= 215.710272123255
## Mean crit= 278.456063222927
## 
## After 150 models:
## Best model: target~1+indus+nox+rad
## Crit= 184.973368653889
## Mean crit= 238.257894055916
## 
## After 200 models:
## Best model: target~1+indus+nox+age+rad
## Crit= 183.172071295119
## Mean crit= 211.575417329578
## 
## After 250 models:
## Best model: target~1+zn+indus+nox+age+dis+rad
## Crit= 178.982898917757
## Mean crit= 201.7109464495
## 
## After 300 models:
## Best model: target~1+zn+indus+nox+age+dis+rad
## Crit= 178.982898917757
## Mean crit= 198.167791342044
## 
## After 350 models:
## Best model: target~1+zn+indus+nox+age+dis+rad
## Crit= 178.982898917757
## Mean crit= 197.687555940655
## 
## After 400 models:
## Best model: target~1+zn+indus+nox+age+dis+rad
## Crit= 178.982898917757
## Mean crit= 196.919412004912
## 
## After 450 models:
## Best model: target~1+zn+indus+nox+age+dis+rad
## Crit= 178.982898917757
## Mean crit= 188.747355360248
## 
## After 500 models:
## Best model: target~1+zn+nox+dis+rad+tax
## Crit= 177.895733969507
## Mean crit= 183.674491111715
## 
## After 550 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 181.779574196525
## 
## After 600 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 181.779574196525
## 
## After 650 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 181.779574196525
## 
## After 700 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 181.60873388419
## 
## After 750 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 181.365499837667
## 
## After 800 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 180.889875228704
## 
## After 850 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 180.889875228704
## 
## After 900 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 180.889875228704
## 
## After 950 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 180.550867764554
## 
## After 1000 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 179.743881972207
## 
## After 1050 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.85929440443
## 
## After 1100 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.490695235953
## 
## After 1150 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.490695235953
## 
## After 1200 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.490695235953
## 
## After 1250 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.490695235953
## 
## After 1300 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.488523831783
## 
## After 1350 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.483481028299
## 
## After 1400 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.483481028299
## 
## After 1450 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.483481028299
## 
## After 1500 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.483481028299
## 
## After 1550 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.419505295552
## 
## After 1600 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
## 
## After 1650 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
## 
## After 1700 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
## 
## After 1750 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
## 
## After 1800 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
## 
## After 1850 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
## 
## After 1900 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
## 
## After 1950 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
## 
## After 2000 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
## 
## After 2050 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.08593570903
## 
## After 2100 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.921626614104
## 
## After 2150 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.775763365878
## 
## After 2200 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.775763365878
## 
## After 2250 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.775763365878
## 
## After 2300 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.775763365878
## 
## After 2350 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.775763365878
## 
## After 2400 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.679002115378
## 
## After 2450 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.679002115378
## 
## After 2500 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.679002115378
## 
## After 2550 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.679002115378
## 
## After 2600 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.666075501208
## 
## After 2650 models:
## Best model: target~1+zn+nox+age+dis+rad+tax+medv
## Crit= 174.272096442614
## Mean crit= 177.411197040363
## 
## After 2700 models:
## Best model: target~1+zn+nox+age+dis+rad+tax+medv
## Crit= 174.272096442614
## Mean crit= 177.411197040363
## 
## After 2750 models:
## Best model: target~1+zn+nox+age+dis+rad+tax+medv
## Crit= 174.272096442614
## Mean crit= 177.411197040363
## 
## After 2800 models:
## Best model: target~1+zn+nox+age+dis+rad+tax+medv
## Crit= 174.272096442614
## Mean crit= 177.411197040363
## 
## After 2850 models:
## Best model: target~1+zn+nox+age+dis+rad+tax+medv
## Crit= 174.272096442614
## Mean crit= 177.411197040363
## 
## After 2900 models:
## Best model: target~1+zn+indus+chas+nox+age+dis+rad+ptratio+medv
## Crit= 173.14601019601
## Mean crit= 177.228777773728
## 
## After 2950 models:
## Best model: target~1+zn+indus+chas+nox+rm+age+dis+rad+ptratio+medv
## Crit= 173.033023203703
## Mean crit= 177.042586823876
## 
## After 3000 models:
## Best model: target~1+zn+indus+chas+nox+rm+age+dis+rad+ptratio+medv
## Crit= 173.033023203703
## Mean crit= 177.042586823876
## 
## After 3050 models:
## Best model: target~1+zn+indus+chas+nox+rm+age+dis+rad+ptratio+medv
## Crit= 173.033023203703
## Mean crit= 177.042586823876
## 
## After 3100 models:
## Best model: target~1+zn+indus+chas+nox+rm+age+dis+rad+ptratio+medv
## Crit= 173.033023203703
## Mean crit= 176.998313687738
## 
## After 3150 models:
## Best model: target~1+zn+indus+chas+nox+rm+age+dis+rad+ptratio+medv
## Crit= 173.033023203703
## Mean crit= 176.940134353252
## 
## After 3200 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.120818650972
## 
## After 3250 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.120818650972
## 
## After 3300 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.120818650972
## 
## After 3350 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.120818650972
## 
## After 3400 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.120818650972
## 
## After 3450 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.099328243548
## 
## After 3500 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.099328243548
## 
## After 3550 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.099328243548
## 
## After 3600 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.099328243548
## 
## After 3650 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.099328243548
## 
## After 3700 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.009122158001
## 
## After 3750 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.994719253727
## 
## After 3800 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.994719253727
## 
## After 3850 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.994719253727
## 
## After 3900 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.994719253727
## 
## After 3950 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.978528353695
## 
## After 4000 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.832558157461
## 
## After 4050 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.832558157461
## 
## After 4100 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.832558157461
## 
## After 4150 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.832558157461
## 
## After 4200 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.767636692846
## 
## After 4250 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.311420706647
## Completed.

summary(model1@objects[[1]])

## 
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7621  -0.2870  -0.0080   0.0057   3.2397  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -36.411735   6.701072  -5.434 5.52e-08 ***
## zn           -0.052983   0.034824  -1.521  0.12815    
## indus        -0.069985   0.048870  -1.432  0.15213    
## nox          44.712077   8.392153   5.328 9.94e-08 ***
## age           0.032535   0.012388   2.626  0.00863 ** 
## dis           0.749006   0.262842   2.850  0.00438 ** 
## rad           0.569547   0.159955   3.561  0.00037 ***
## tax          -0.005851   0.003072  -1.905  0.05683 .  
## ptratio       0.268150   0.129009   2.079  0.03766 *  
## medv          0.089608   0.039255   2.283  0.02245 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 452.43  on 326  degrees of freedom
## Residual deviance: 152.03  on 317  degrees of freedom
## AIC: 172.03
## 
## Number of Fisher Scoring iterations: 8

# Confirm I used correct model
# Ask about Target being factor earlier


test_M1$predictions<- predict(model1@objects[[1]], test_M1, type="response")
test_M1$predicted =  as.factor(ifelse(test_M1$predictions >= 0.5, 1, 0))

test_M1$target <- as.factor(test_M1$target)
confusionMatrix(test_M1$predicted, test_M1$target, positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 60  5
##          1  5 69
##                                          
##                Accuracy : 0.9281         
##                  95% CI : (0.8717, 0.965)
##     No Information Rate : 0.5324         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.8555         
##                                          
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 0.9324         
##             Specificity : 0.9231         
##          Pos Pred Value : 0.9324         
##          Neg Pred Value : 0.9231         
##              Prevalence : 0.5324         
##          Detection Rate : 0.4964         
##    Detection Prevalence : 0.5324         
##       Balanced Accuracy : 0.9278         
##                                          
##        'Positive' Class : 1              
##

proc = roc(test_M1$target, test_M1$predictions)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(proc)

print(proc$auc)

## Area under the curve: 0.9838

Model 2 - Stepwise Regression and Calculated Variables

Model by Adam Gersowitz

# Copying test and train subset to unique variable name 
train_M2 <- train2
test_M2 <- test2

# Calcuated vars on test set

test_M2$target <- as.factor(test_M2$target)
test_M2$cfas <- as.factor(test_M2$chas)
test_M2$business<-test_M2$tax*(1-test_M2$indus)
test_M2$apartment<-test_M2$rm/test_M2$tax
test_M2$pollution<-test_M2$nox*test_M2$indus
test_M2$zndum <- ifelse(test_M2$zn>0,1,0)
test_M2$rmlog<-log(test_M2$rm)
test_M2$raddum <- ifelse(test_M2$rad>23,1,0)
test_M2$lstatptratio<-((1-test_M2$lstat)*test_M2$ptratio)#/test_M2$rm

# Calcuated vars on test set

train_M2$target <- as.factor(train_M2$target)
train_M2$cfas <- as.factor(train_M2$chas)
train_M2$business<-train_M2$tax*(1-train_M2$indus)
train_M2$apartment<-train_M2$rm/train_M2$tax
train_M2$pollution<-train_M2$nox*train_M2$indus
train_M2$zndum <- ifelse(train_M2$zn>0,1,0)
train_M2$rmlog<-log(train_M2$rm)
train_M2$raddum <- ifelse(train_M2$rad>23,1,0)
train_M2$lstatptratio<-((1-train_M2$lstat)*train_M2$ptratio)#/train_M2$rm

Model_2 <- glm(target~., data = train_M2, family = "binomial") %>%
  stepAIC(trace = FALSE)
summary(Model_2)

## 
## Call:
## glm(formula = target ~ nox + rm + dis + rad + tax + lstat + business + 
##     apartment + pollution + rmlog + raddum + lstatptratio, family = "binomial", 
##     data = train_M2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3124  -0.0448   0.0000   0.0001   4.6220  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.984e+02  9.493e+01   3.144 0.001668 ** 
## nox           9.004e+01  2.087e+01   4.314 1.60e-05 ***
## rm            6.034e+01  1.777e+01   3.395 0.000686 ***
## dis           4.546e-01  3.080e-01   1.476 0.139912    
## rad           8.286e-01  2.684e-01   3.088 0.002018 ** 
## tax          -3.586e-01  7.635e-02  -4.697 2.64e-06 ***
## lstat        -7.261e-01  2.727e-01  -2.663 0.007748 ** 
## business     -7.089e-03  1.862e-03  -3.808 0.000140 ***
## apartment    -4.511e+03  1.124e+03  -4.014 5.96e-05 ***
## pollution    -3.748e+00  1.135e+00  -3.303 0.000957 ***
## rmlog        -2.867e+02  1.010e+02  -2.839 0.004529 ** 
## raddum        3.732e+01  1.578e+03   0.024 0.981133    
## lstatptratio -3.801e-02  1.390e-02  -2.734 0.006256 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 452.434  on 326  degrees of freedom
## Residual deviance:  80.282  on 314  degrees of freedom
## AIC: 106.28
## 
## Number of Fisher Scoring iterations: 19

test_M2$predictions<-predict(Model_2, test_M2, type="response")
test_M2$predicted =  as.factor(ifelse(test_M2$predictions >= 0.5, 1, 0))

confusionMatrix(test_M2$predicted, test_M2$target, positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 65  4
##          1  0 70
##                                          
##                Accuracy : 0.9712         
##                  95% CI : (0.928, 0.9921)
##     No Information Rate : 0.5324         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9424         
##                                          
##  Mcnemar's Test P-Value : 0.1336         
##                                          
##             Sensitivity : 0.9459         
##             Specificity : 1.0000         
##          Pos Pred Value : 1.0000         
##          Neg Pred Value : 0.9420         
##              Prevalence : 0.5324         
##          Detection Rate : 0.5036         
##    Detection Prevalence : 0.5036         
##       Balanced Accuracy : 0.9730         
##                                          
##        'Positive' Class : 1              
##

proc = roc(test_M2$target, test_M2$predictions)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(proc)

print(proc$auc)

## Area under the curve: 0.9956

Model 3 - Lasso

Model by David Blumenstiel

# Copying test and train subset to unique variable name 
test_M3 <- test2
train_M3 <- train2

test_M3$target <- as.factor(test_M3$target)
train_M3$target <- as.factor(train_M3$target)

library(glmnet)  #Was a helpful guide: https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html

## Warning: package 'glmnet' was built under R version 3.6.3

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.1-1

#Data prep.  Needs to be in matrix format
#Took code from here: https://stackoverflow.com/questions/35437411/error-in-predict-glmnet-function-not-yet-implemented-method

trainx = model.matrix(~.-target,data=train_M3)     
newx = model.matrix(~.-target,data=test_M3)
#Makes a series of crossvalidated glmnet models for 100 lambda values (default)
#lamba values are constants that define coefficient shrinkage.  
glmnetmodel <- cv.glmnet(x = trainx,   #Predictor variables
                      y = train_M3[,names(train_M3) == "target"],   #Responce variable
                      family = "binomial", #Has it do logistic regression
                      nfolds = 10, #10 fold cv
                      type.measure = "class",  #uses missclassification error as loss
                      gamma = seq(0,1,0.1),  #Values to use for relaxed fit
                      relax = TRUE,#Mixes relaxed fit with regluarized fit
                      alpha = 1) #Basically a choice betwen lasso, ridge, or elasticnet regression.  Alpha = 1 is lasso.
#Predicts the probability that the target variable is 1
predictions <- predict(glmnetmodel, newx = newx, type = "response", s=glmnetmodel$lambda.min) #setting lambda.min uses the lambda value with the minimum mean cv error (picks the best model)
#Print's the coefficients the model uses
print(coef.glmnet(glmnetmodel, s = glmnetmodel$lambda.min))

## 14 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept) -33.18931637
## (Intercept)   .         
## zn           -0.03787292
## indus        -0.08199683
## chas          0.85574122
## nox          40.35258411
## rm            .         
## age           0.02436244
## dis           0.59880268
## rad           0.43866864
## tax          -0.00406803
## ptratio       0.25457861
## lstat         0.04079768
## medv          0.09205576

confusionMatrix(as.factor(ifelse(predictions >= 0.5, 1, 0)), test_M3$target, positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 61  7
##          1  4 67
##                                           
##                Accuracy : 0.9209          
##                  95% CI : (0.8628, 0.9598)
##     No Information Rate : 0.5324          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8415          
##                                           
##  Mcnemar's Test P-Value : 0.5465          
##                                           
##             Sensitivity : 0.9054          
##             Specificity : 0.9385          
##          Pos Pred Value : 0.9437          
##          Neg Pred Value : 0.8971          
##              Prevalence : 0.5324          
##          Detection Rate : 0.4820          
##    Detection Prevalence : 0.5108          
##       Balanced Accuracy : 0.9219          
##                                           
##        'Positive' Class : 1               
##

proc = roc(test_M3$target, predictions)

## Setting levels: control = 0, case = 1

## Warning in roc.default(test_M3$target, predictions): Deprecated use a matrix as
## predictor. Unexpected results may be produced, please pass a numeric vector.

## Setting direction: controls < cases

plot(proc)

print(proc$auc)

## Area under the curve: 0.9844

Model Selection

As Model 2 preformed the best …

Rerun model on entire training set

evaluation_F <- evaluation
training_F <- training

evaluation_F$cfas <- as.factor(evaluation_F$chas)
evaluation_F$business<-evaluation_F$tax*(1-evaluation_F$indus)
evaluation_F$apartment<-evaluation_F$rm/evaluation_F$tax
evaluation_F$pollution<-evaluation_F$nox*evaluation_F$indus
evaluation_F$zndum <- ifelse(evaluation_F$zn>0,1,0)
evaluation_F$rmlog<-log(evaluation_F$rm)
evaluation_F$raddum <- ifelse(evaluation_F$rad>23,1,0)
evaluation_F$lstatptratio<-((1-evaluation_F$lstat)*evaluation_F$ptratio)#/evaluation_F$rm

training_F$target <- as.factor(training_F$target)
training_F$cfas <- as.factor(training_F$chas)
training_F$business<-training_F$tax*(1-training_F$indus)
training_F$apartment<-training_F$rm/training_F$tax
training_F$pollution<-training_F$nox*training_F$indus
training_F$zndum <- ifelse(training_F$zn>0,1,0)
training_F$rmlog<-log(training_F$rm)
training_F$raddum <- ifelse(training_F$rad>23,1,0)
training_F$lstatptratio<-((1-training_F$lstat)*training_F$ptratio)#/training_F$rm


Model_F <- glm(target~., data = training_F, family = "binomial") %>%
  stepAIC(trace = FALSE)
summary(Model_F)

## 
## Call:
## glm(formula = target ~ nox + rm + rad + tax + lstat + business + 
##     apartment + pollution + rmlog + raddum + lstatptratio, family = "binomial", 
##     data = training_F)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9036  -0.0290   0.0000   0.0001   4.7310  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.641e+02  8.741e+01   3.022 0.002514 ** 
## nox           7.917e+01  1.462e+01   5.415 6.14e-08 ***
## rm            5.390e+01  1.664e+01   3.240 0.001196 ** 
## rad           9.930e-01  2.473e-01   4.016 5.92e-05 ***
## tax          -3.459e-01  6.765e-02  -5.114 3.16e-07 ***
## lstat        -8.961e-01  2.498e-01  -3.588 0.000333 ***
## business     -6.240e-03  1.423e-03  -4.384 1.16e-05 ***
## apartment    -4.506e+03  1.048e+03  -4.302 1.69e-05 ***
## pollution    -3.239e+00  8.585e-01  -3.773 0.000161 ***
## rmlog        -2.447e+02  9.322e+01  -2.625 0.008663 ** 
## raddum        3.421e+01  1.257e+03   0.027 0.978292    
## lstatptratio -5.001e-02  1.322e-02  -3.784 0.000154 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 103.06  on 454  degrees of freedom
## AIC: 127.06
## 
## Number of Fisher Scoring iterations: 19

Predict Test Set / Export results

predicted_probability <- predict(Model_F, evaluation_F, type = "response")
predicted_class <-  as.factor(ifelse(predicted_probability >= 0.5, 1, 0))
predictions <- data.frame(predicted_class,predicted_probability)
colnames(predictions) <- c("predicted_class","predicted_probability")
#write.csv(predictions, file = "predictions.csv")