Homework 3
Overview
In this homework assignment, you will explore, analyze and model a data set containing information on crime for various neighborhoods of a major city. Each record has a response variable indicating whether or not the crime rate is above the median crime rate (1) or not(0).
Your objective is to build a binary logistic regression model on the training data set to predict whether the neighborhood will be at risk for high crime levels. You will provide classifications and probabilities for the evaluation data set using your binary logistic regression model. You can only use the variables given to you (or variables that you derive from the variables provided). Below is a short description of the variables of interest in the data set:
- zn: proportion of residential land zoned for large lots (over 25000 square feet) (predictor variable)
- indus: proportion of non-retail business acres per suburb (predictor variable)
- chas: a dummy var. for whether the suburb borders the Charles River (1) or not (0) (predictor variable)
- nox: nitrogen oxides concentration (parts per 10 million) (predictor variable)
- rm: average number of rooms per dwelling (predictor variable)
- age: proportion of owner-occupied units built prior to 1940 (predictor variable)
- dis: weighted mean of distances to five Boston employment centers (predictor variable)
- rad: index of accessibility to radial highways (predictor variable)
- tax: full-value property-tax rate per $10,000 (predictor variable)
- ptratio: pupil-teacher ratio by town (predictor variable)
- black: 1000(Bk - 0.63)2 where Bk is the proportion of blacks by town (predictor variable)
- lstat: lower status of the population (percent) (predictor variable)
- medv: median value of owner-occupied homes in $1000s (predictor variable)
- target: whether the crime rate is above the median crime rate (1) or not (0) (response variable)
Data Exploration
training <- read.csv('./crime-training-data_modified.csv')
training2 <- training # for melting and box plot
evaluation <- read.csv('./crime-evaluation-data_modified.csv')
training %>% head() %>% kable()
zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | lstat | medv | target |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 19.58 | 0 | 0.605 | 7.929 | 96.2 | 2.0459 | 5 | 403 | 14.7 | 3.70 | 50.0 | 1 |
0 | 19.58 | 1 | 0.871 | 5.403 | 100.0 | 1.3216 | 5 | 403 | 14.7 | 26.82 | 13.4 | 1 |
0 | 18.10 | 0 | 0.740 | 6.485 | 100.0 | 1.9784 | 24 | 666 | 20.2 | 18.85 | 15.4 | 1 |
30 | 4.93 | 0 | 0.428 | 6.393 | 7.8 | 7.0355 | 6 | 300 | 16.6 | 5.19 | 23.7 | 0 |
0 | 2.46 | 0 | 0.488 | 7.155 | 92.2 | 2.7006 | 3 | 193 | 17.8 | 4.82 | 37.9 | 0 |
0 | 8.56 | 0 | 0.520 | 6.781 | 71.3 | 2.8561 | 5 | 384 | 20.9 | 7.67 | 26.5 | 0 |
# Converting to factor
var <- c("chas","target")
training[,var] <- lapply(training[,var], as.factor)
evaluation$chas <- as.factor(evaluation$chas)
Checking missing data
## zn indus chas nox rm age dis rad tax ptratio
## 0 0 0 0 0 0 0 0 0 0
## lstat medv target
## 0 0 0
# Boxplot to see distributions with target variable
melt(training2, id.vars='target') %>% mutate(target = as.factor(target)) %>%
ggplot(., aes(x=variable, y=value))+geom_boxplot(aes(fill=target))+facet_wrap(~variable, dir='h',scales='free')+ labs(title="BoxPlot - Predictors Data Distribution with Target Variable")
# Correlation matrix among variables
training2 %>%
cor(., use = "complete.obs") %>%
corrplot(., method = "color", type = "upper", tl.col = "black", tl.cex=.8, diag = FALSE)
# Correlation table
correlation <- training2 %>%
cor(., use = "complete.obs") %>%
as.data.frame() %>%
rownames_to_column()%>%
gather(Variable, Correlation, -rowname)
correlation %>%
filter(Variable == "target") %>%
arrange(desc(Correlation)) %>%
kable()
rowname | Variable | Correlation |
---|---|---|
target | target | 1.0000000 |
nox | target | 0.7261062 |
age | target | 0.6301062 |
rad | target | 0.6281049 |
tax | target | 0.6111133 |
indus | target | 0.6048507 |
lstat | target | 0.4691270 |
ptratio | target | 0.2508489 |
chas | target | 0.0800419 |
rm | target | -0.1525533 |
medv | target | -0.2705507 |
zn | target | -0.4316818 |
dis | target | -0.6186731 |
# Density plot to check normality
melt(training2, id.vars='target') %>% mutate(target = as.factor(target)) %>%
ggplot(., aes(x=value))+geom_density(fill='gray')+facet_wrap(~variable, scales='free')+
labs(title="Density Plot for Normality and Skewness") +
theme_classic()
## zn indus chas nox rm age
## 2.17681518 0.28854503 3.33548988 0.74632807 0.47932023 -0.57770755
## dis rad tax ptratio lstat medv
## 0.99889262 1.01027875 0.65931363 -0.75426808 0.90558642 1.07669198
## target
## 0.03422935
Data Preparation
Data Splitting
# Data splitting into train and test datasets out of training2
set.seed(1003)
training_partition <- createDataPartition(training2$target, p=0.7, list = FALSE, times=1)
train2 <- training2[training_partition, ]
test2 <- training2[-training_partition, ]
sapply(training2, skewness, function(x) skewness(x))
## zn indus chas nox rm age
## 2.17681518 0.28854503 3.33548988 0.74632807 0.47932023 -0.57770755
## dis rad tax ptratio lstat medv
## 0.99889262 1.01027875 0.65931363 -0.75426808 0.90558642 1.07669198
## target
## 0.03422935
log transformation
train_log <- train2 # copy of basic model for log transformation
test_log <- test2
train_log$zn <- log10(train_log$zn + 1)
test_log$zn <- log10(test_log$zn + 1)
# Plot and check skewness
sapply(train_log, skewness, function(x) skewness(x))
## zn indus chas nox rm age dis
## 1.1954317 0.3708873 3.2567321 0.8108244 0.4843153 -0.5322325 0.9721716
## rad tax ptratio lstat medv target
## 1.1317640 0.7385414 -0.8218609 0.9700911 0.9700927 0.1036391
BoxCox Transformation
# Copy of train and test
train_boxcox <- train2
test_boxcox <- test2
# Preprocessing
preproc_value <- preProcess(train2[,-1] , c("BoxCox", "center", "scale"))
# Transformation on both train and test datasets
train_boxcox_transformed <- predict(preproc_value, train_boxcox)
test_boxcox_transformed <- predict(preproc_value, test_boxcox)
ggplot(melt(train_boxcox_transformed), aes(x=value))+geom_density()+facet_wrap(~variable, scales='free') + labs(title="BoxCox Transformation")
## zn indus chas nox rm age
## 2.353729370 -0.120195838 3.256732134 0.068418489 0.026658478 -0.366340589
## dis rad tax ptratio lstat medv
## 0.106811243 0.402503618 0.069764777 -0.613098264 -0.002592159 -0.039697233
## target
## 0.103639097
Build Models
Model 1 - Glmulti
Model by Forhad Akbar
# Model1 using glmulti()
model1 <- glmulti(target ~ ., data = train_M1, level = 1, method="h", crit = "aic", plotty = FALSE, fitfunction = "glm", family=binomial)
## Initialization...
## TASK: Exhaustive screening of candidate set.
## Fitting...
##
## After 50 models:
## Best model: target~1+zn+indus+nox+rm
## Crit= 222.104661461643
## Mean crit= 291.521601208484
##
## After 100 models:
## Best model: target~1+zn+indus+chas+nox+rm+dis
## Crit= 215.710272123255
## Mean crit= 278.456063222927
##
## After 150 models:
## Best model: target~1+indus+nox+rad
## Crit= 184.973368653889
## Mean crit= 238.257894055916
##
## After 200 models:
## Best model: target~1+indus+nox+age+rad
## Crit= 183.172071295119
## Mean crit= 211.575417329578
##
## After 250 models:
## Best model: target~1+zn+indus+nox+age+dis+rad
## Crit= 178.982898917757
## Mean crit= 201.7109464495
##
## After 300 models:
## Best model: target~1+zn+indus+nox+age+dis+rad
## Crit= 178.982898917757
## Mean crit= 198.167791342044
##
## After 350 models:
## Best model: target~1+zn+indus+nox+age+dis+rad
## Crit= 178.982898917757
## Mean crit= 197.687555940655
##
## After 400 models:
## Best model: target~1+zn+indus+nox+age+dis+rad
## Crit= 178.982898917757
## Mean crit= 196.919412004912
##
## After 450 models:
## Best model: target~1+zn+indus+nox+age+dis+rad
## Crit= 178.982898917757
## Mean crit= 188.747355360248
##
## After 500 models:
## Best model: target~1+zn+nox+dis+rad+tax
## Crit= 177.895733969507
## Mean crit= 183.674491111715
##
## After 550 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 181.779574196525
##
## After 600 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 181.779574196525
##
## After 650 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 181.779574196525
##
## After 700 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 181.60873388419
##
## After 750 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 181.365499837667
##
## After 800 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 180.889875228704
##
## After 850 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 180.889875228704
##
## After 900 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 180.889875228704
##
## After 950 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 180.550867764554
##
## After 1000 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 179.743881972207
##
## After 1050 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.85929440443
##
## After 1100 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.490695235953
##
## After 1150 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.490695235953
##
## After 1200 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.490695235953
##
## After 1250 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.490695235953
##
## After 1300 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.488523831783
##
## After 1350 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.483481028299
##
## After 1400 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.483481028299
##
## After 1450 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.483481028299
##
## After 1500 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.483481028299
##
## After 1550 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.419505295552
##
## After 1600 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
##
## After 1650 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
##
## After 1700 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
##
## After 1750 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
##
## After 1800 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
##
## After 1850 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
##
## After 1900 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
##
## After 1950 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
##
## After 2000 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.119453784751
##
## After 2050 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 178.08593570903
##
## After 2100 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.921626614104
##
## After 2150 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.775763365878
##
## After 2200 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.775763365878
##
## After 2250 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.775763365878
##
## After 2300 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.775763365878
##
## After 2350 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.775763365878
##
## After 2400 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.679002115378
##
## After 2450 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.679002115378
##
## After 2500 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.679002115378
##
## After 2550 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.679002115378
##
## After 2600 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax
## Crit= 175.074572155548
## Mean crit= 177.666075501208
##
## After 2650 models:
## Best model: target~1+zn+nox+age+dis+rad+tax+medv
## Crit= 174.272096442614
## Mean crit= 177.411197040363
##
## After 2700 models:
## Best model: target~1+zn+nox+age+dis+rad+tax+medv
## Crit= 174.272096442614
## Mean crit= 177.411197040363
##
## After 2750 models:
## Best model: target~1+zn+nox+age+dis+rad+tax+medv
## Crit= 174.272096442614
## Mean crit= 177.411197040363
##
## After 2800 models:
## Best model: target~1+zn+nox+age+dis+rad+tax+medv
## Crit= 174.272096442614
## Mean crit= 177.411197040363
##
## After 2850 models:
## Best model: target~1+zn+nox+age+dis+rad+tax+medv
## Crit= 174.272096442614
## Mean crit= 177.411197040363
##
## After 2900 models:
## Best model: target~1+zn+indus+chas+nox+age+dis+rad+ptratio+medv
## Crit= 173.14601019601
## Mean crit= 177.228777773728
##
## After 2950 models:
## Best model: target~1+zn+indus+chas+nox+rm+age+dis+rad+ptratio+medv
## Crit= 173.033023203703
## Mean crit= 177.042586823876
##
## After 3000 models:
## Best model: target~1+zn+indus+chas+nox+rm+age+dis+rad+ptratio+medv
## Crit= 173.033023203703
## Mean crit= 177.042586823876
##
## After 3050 models:
## Best model: target~1+zn+indus+chas+nox+rm+age+dis+rad+ptratio+medv
## Crit= 173.033023203703
## Mean crit= 177.042586823876
##
## After 3100 models:
## Best model: target~1+zn+indus+chas+nox+rm+age+dis+rad+ptratio+medv
## Crit= 173.033023203703
## Mean crit= 176.998313687738
##
## After 3150 models:
## Best model: target~1+zn+indus+chas+nox+rm+age+dis+rad+ptratio+medv
## Crit= 173.033023203703
## Mean crit= 176.940134353252
##
## After 3200 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.120818650972
##
## After 3250 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.120818650972
##
## After 3300 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.120818650972
##
## After 3350 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.120818650972
##
## After 3400 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.120818650972
##
## After 3450 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.099328243548
##
## After 3500 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.099328243548
##
## After 3550 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.099328243548
##
## After 3600 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.099328243548
##
## After 3650 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.099328243548
##
## After 3700 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 176.009122158001
##
## After 3750 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.994719253727
##
## After 3800 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.994719253727
##
## After 3850 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.994719253727
##
## After 3900 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.994719253727
##
## After 3950 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.978528353695
##
## After 4000 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.832558157461
##
## After 4050 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.832558157461
##
## After 4100 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.832558157461
##
## After 4150 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.832558157461
##
## After 4200 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.767636692846
##
## After 4250 models:
## Best model: target~1+zn+indus+nox+age+dis+rad+tax+ptratio+medv
## Crit= 172.034915833386
## Mean crit= 175.311420706647
## Completed.
##
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7621 -0.2870 -0.0080 0.0057 3.2397
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -36.411735 6.701072 -5.434 5.52e-08 ***
## zn -0.052983 0.034824 -1.521 0.12815
## indus -0.069985 0.048870 -1.432 0.15213
## nox 44.712077 8.392153 5.328 9.94e-08 ***
## age 0.032535 0.012388 2.626 0.00863 **
## dis 0.749006 0.262842 2.850 0.00438 **
## rad 0.569547 0.159955 3.561 0.00037 ***
## tax -0.005851 0.003072 -1.905 0.05683 .
## ptratio 0.268150 0.129009 2.079 0.03766 *
## medv 0.089608 0.039255 2.283 0.02245 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 452.43 on 326 degrees of freedom
## Residual deviance: 152.03 on 317 degrees of freedom
## AIC: 172.03
##
## Number of Fisher Scoring iterations: 8
# Confirm I used correct model
# Ask about Target being factor earlier
test_M1$predictions<- predict(model1@objects[[1]], test_M1, type="response")
test_M1$predicted = as.factor(ifelse(test_M1$predictions >= 0.5, 1, 0))
test_M1$target <- as.factor(test_M1$target)
confusionMatrix(test_M1$predicted, test_M1$target, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 60 5
## 1 5 69
##
## Accuracy : 0.9281
## 95% CI : (0.8717, 0.965)
## No Information Rate : 0.5324
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8555
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9324
## Specificity : 0.9231
## Pos Pred Value : 0.9324
## Neg Pred Value : 0.9231
## Prevalence : 0.5324
## Detection Rate : 0.4964
## Detection Prevalence : 0.5324
## Balanced Accuracy : 0.9278
##
## 'Positive' Class : 1
##
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.9838
Model 2 - Stepwise Regression and Calculated Variables
Model by Adam Gersowitz
# Calcuated vars on test set
test_M2$target <- as.factor(test_M2$target)
test_M2$cfas <- as.factor(test_M2$chas)
test_M2$business<-test_M2$tax*(1-test_M2$indus)
test_M2$apartment<-test_M2$rm/test_M2$tax
test_M2$pollution<-test_M2$nox*test_M2$indus
test_M2$zndum <- ifelse(test_M2$zn>0,1,0)
test_M2$rmlog<-log(test_M2$rm)
test_M2$raddum <- ifelse(test_M2$rad>23,1,0)
test_M2$lstatptratio<-((1-test_M2$lstat)*test_M2$ptratio)#/test_M2$rm
# Calcuated vars on test set
train_M2$target <- as.factor(train_M2$target)
train_M2$cfas <- as.factor(train_M2$chas)
train_M2$business<-train_M2$tax*(1-train_M2$indus)
train_M2$apartment<-train_M2$rm/train_M2$tax
train_M2$pollution<-train_M2$nox*train_M2$indus
train_M2$zndum <- ifelse(train_M2$zn>0,1,0)
train_M2$rmlog<-log(train_M2$rm)
train_M2$raddum <- ifelse(train_M2$rad>23,1,0)
train_M2$lstatptratio<-((1-train_M2$lstat)*train_M2$ptratio)#/train_M2$rm
Model_2 <- glm(target~., data = train_M2, family = "binomial") %>%
stepAIC(trace = FALSE)
summary(Model_2)
##
## Call:
## glm(formula = target ~ nox + rm + dis + rad + tax + lstat + business +
## apartment + pollution + rmlog + raddum + lstatptratio, family = "binomial",
## data = train_M2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3124 -0.0448 0.0000 0.0001 4.6220
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.984e+02 9.493e+01 3.144 0.001668 **
## nox 9.004e+01 2.087e+01 4.314 1.60e-05 ***
## rm 6.034e+01 1.777e+01 3.395 0.000686 ***
## dis 4.546e-01 3.080e-01 1.476 0.139912
## rad 8.286e-01 2.684e-01 3.088 0.002018 **
## tax -3.586e-01 7.635e-02 -4.697 2.64e-06 ***
## lstat -7.261e-01 2.727e-01 -2.663 0.007748 **
## business -7.089e-03 1.862e-03 -3.808 0.000140 ***
## apartment -4.511e+03 1.124e+03 -4.014 5.96e-05 ***
## pollution -3.748e+00 1.135e+00 -3.303 0.000957 ***
## rmlog -2.867e+02 1.010e+02 -2.839 0.004529 **
## raddum 3.732e+01 1.578e+03 0.024 0.981133
## lstatptratio -3.801e-02 1.390e-02 -2.734 0.006256 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 452.434 on 326 degrees of freedom
## Residual deviance: 80.282 on 314 degrees of freedom
## AIC: 106.28
##
## Number of Fisher Scoring iterations: 19
test_M2$predictions<-predict(Model_2, test_M2, type="response")
test_M2$predicted = as.factor(ifelse(test_M2$predictions >= 0.5, 1, 0))
confusionMatrix(test_M2$predicted, test_M2$target, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 65 4
## 1 0 70
##
## Accuracy : 0.9712
## 95% CI : (0.928, 0.9921)
## No Information Rate : 0.5324
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9424
##
## Mcnemar's Test P-Value : 0.1336
##
## Sensitivity : 0.9459
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9420
## Prevalence : 0.5324
## Detection Rate : 0.5036
## Detection Prevalence : 0.5036
## Balanced Accuracy : 0.9730
##
## 'Positive' Class : 1
##
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Area under the curve: 0.9956
Model 3 - Lasso
Model by David Blumenstiel
# Copying test and train subset to unique variable name
test_M3 <- test2
train_M3 <- train2
test_M3$target <- as.factor(test_M3$target)
train_M3$target <- as.factor(train_M3$target)
## Warning: package 'glmnet' was built under R version 3.6.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-1
#Data prep. Needs to be in matrix format
#Took code from here: https://stackoverflow.com/questions/35437411/error-in-predict-glmnet-function-not-yet-implemented-method
trainx = model.matrix(~.-target,data=train_M3)
newx = model.matrix(~.-target,data=test_M3)
#Makes a series of crossvalidated glmnet models for 100 lambda values (default)
#lamba values are constants that define coefficient shrinkage.
glmnetmodel <- cv.glmnet(x = trainx, #Predictor variables
y = train_M3[,names(train_M3) == "target"], #Responce variable
family = "binomial", #Has it do logistic regression
nfolds = 10, #10 fold cv
type.measure = "class", #uses missclassification error as loss
gamma = seq(0,1,0.1), #Values to use for relaxed fit
relax = TRUE,#Mixes relaxed fit with regluarized fit
alpha = 1) #Basically a choice betwen lasso, ridge, or elasticnet regression. Alpha = 1 is lasso.
#Predicts the probability that the target variable is 1
predictions <- predict(glmnetmodel, newx = newx, type = "response", s=glmnetmodel$lambda.min) #setting lambda.min uses the lambda value with the minimum mean cv error (picks the best model)
#Print's the coefficients the model uses
print(coef.glmnet(glmnetmodel, s = glmnetmodel$lambda.min))
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -33.18931637
## (Intercept) .
## zn -0.03787292
## indus -0.08199683
## chas 0.85574122
## nox 40.35258411
## rm .
## age 0.02436244
## dis 0.59880268
## rad 0.43866864
## tax -0.00406803
## ptratio 0.25457861
## lstat 0.04079768
## medv 0.09205576
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 61 7
## 1 4 67
##
## Accuracy : 0.9209
## 95% CI : (0.8628, 0.9598)
## No Information Rate : 0.5324
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8415
##
## Mcnemar's Test P-Value : 0.5465
##
## Sensitivity : 0.9054
## Specificity : 0.9385
## Pos Pred Value : 0.9437
## Neg Pred Value : 0.8971
## Prevalence : 0.5324
## Detection Rate : 0.4820
## Detection Prevalence : 0.5108
## Balanced Accuracy : 0.9219
##
## 'Positive' Class : 1
##
## Setting levels: control = 0, case = 1
## Warning in roc.default(test_M3$target, predictions): Deprecated use a matrix as
## predictor. Unexpected results may be produced, please pass a numeric vector.
## Setting direction: controls < cases
## Area under the curve: 0.9844
Model Selection
As Model 2 preformed the best …
Rerun model on entire training set
evaluation_F <- evaluation
training_F <- training
evaluation_F$cfas <- as.factor(evaluation_F$chas)
evaluation_F$business<-evaluation_F$tax*(1-evaluation_F$indus)
evaluation_F$apartment<-evaluation_F$rm/evaluation_F$tax
evaluation_F$pollution<-evaluation_F$nox*evaluation_F$indus
evaluation_F$zndum <- ifelse(evaluation_F$zn>0,1,0)
evaluation_F$rmlog<-log(evaluation_F$rm)
evaluation_F$raddum <- ifelse(evaluation_F$rad>23,1,0)
evaluation_F$lstatptratio<-((1-evaluation_F$lstat)*evaluation_F$ptratio)#/evaluation_F$rm
training_F$target <- as.factor(training_F$target)
training_F$cfas <- as.factor(training_F$chas)
training_F$business<-training_F$tax*(1-training_F$indus)
training_F$apartment<-training_F$rm/training_F$tax
training_F$pollution<-training_F$nox*training_F$indus
training_F$zndum <- ifelse(training_F$zn>0,1,0)
training_F$rmlog<-log(training_F$rm)
training_F$raddum <- ifelse(training_F$rad>23,1,0)
training_F$lstatptratio<-((1-training_F$lstat)*training_F$ptratio)#/training_F$rm
Model_F <- glm(target~., data = training_F, family = "binomial") %>%
stepAIC(trace = FALSE)
summary(Model_F)
##
## Call:
## glm(formula = target ~ nox + rm + rad + tax + lstat + business +
## apartment + pollution + rmlog + raddum + lstatptratio, family = "binomial",
## data = training_F)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9036 -0.0290 0.0000 0.0001 4.7310
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.641e+02 8.741e+01 3.022 0.002514 **
## nox 7.917e+01 1.462e+01 5.415 6.14e-08 ***
## rm 5.390e+01 1.664e+01 3.240 0.001196 **
## rad 9.930e-01 2.473e-01 4.016 5.92e-05 ***
## tax -3.459e-01 6.765e-02 -5.114 3.16e-07 ***
## lstat -8.961e-01 2.498e-01 -3.588 0.000333 ***
## business -6.240e-03 1.423e-03 -4.384 1.16e-05 ***
## apartment -4.506e+03 1.048e+03 -4.302 1.69e-05 ***
## pollution -3.239e+00 8.585e-01 -3.773 0.000161 ***
## rmlog -2.447e+02 9.322e+01 -2.625 0.008663 **
## raddum 3.421e+01 1.257e+03 0.027 0.978292
## lstatptratio -5.001e-02 1.322e-02 -3.784 0.000154 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 103.06 on 454 degrees of freedom
## AIC: 127.06
##
## Number of Fisher Scoring iterations: 19
Predict Test Set / Export results
predicted_probability <- predict(Model_F, evaluation_F, type = "response")
predicted_class <- as.factor(ifelse(predicted_probability >= 0.5, 1, 0))
predictions <- data.frame(predicted_class,predicted_probability)
colnames(predictions) <- c("predicted_class","predicted_probability")
#write.csv(predictions, file = "predictions.csv")