##Collect Data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
tr <- read_csv("https://raw.githubusercontent.com/agersowitz/DATA-621/main/HW3%20train.csv")
## Parsed with column specification:
## cols(
##   zn = col_double(),
##   indus = col_double(),
##   chas = col_double(),
##   nox = col_double(),
##   rm = col_double(),
##   age = col_double(),
##   dis = col_double(),
##   rad = col_double(),
##   tax = col_double(),
##   ptratio = col_double(),
##   lstat = col_double(),
##   medv = col_double(),
##   target = col_character()
## )
tr <- data.frame(tr)


ev <- read_csv("https://raw.githubusercontent.com/agersowitz/DATA-621/main/HW3%20Eval.csv")
## Parsed with column specification:
## cols(
##   zn = col_double(),
##   indus = col_double(),
##   chas = col_double(),
##   nox = col_double(),
##   rm = col_double(),
##   age = col_double(),
##   dis = col_double(),
##   rad = col_double(),
##   tax = col_double(),
##   ptratio = col_double(),
##   lstat = col_double(),
##   medv = col_character()
## )
ev <- data.frame(ev)

##Data Exploration

As the first step in data exploration I use the skim function form the skimr package. This shows missing data, mean, percentiles and a histogram of the distribution of all of the data fields all in one output.

We can see there are no fields with missing data so we will not have to address this issue

Next we will use the funModeling package to produce a quick correlation table with the target variable to determine if there are any noteworthy features in the model. We can see that the most highly correlated stats with the target are nox, age, rad, tax, and indus.

library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.9.4 :)
## Examples and tutorials at livebook.datascienceheroes.com
##  / Now in Spanish: librovivodecienciadedatos.ai
library(skimr)


summary(tr)
##        zn             indus             chas              nox        
##  Min.   :  0.00   Min.   : 0.460   Min.   :0.00000   Min.   :0.3890  
##  1st Qu.:  0.00   1st Qu.: 5.145   1st Qu.:0.00000   1st Qu.:0.4480  
##  Median :  0.00   Median : 9.690   Median :0.00000   Median :0.5380  
##  Mean   : 11.58   Mean   :11.105   Mean   :0.07082   Mean   :0.5543  
##  3rd Qu.: 16.25   3rd Qu.:18.100   3rd Qu.:0.00000   3rd Qu.:0.6240  
##  Max.   :100.00   Max.   :27.740   Max.   :1.00000   Max.   :0.8710  
##        rm             age              dis              rad       
##  Min.   :3.863   Min.   :  2.90   Min.   : 1.130   Min.   : 1.00  
##  1st Qu.:5.887   1st Qu.: 43.88   1st Qu.: 2.101   1st Qu.: 4.00  
##  Median :6.210   Median : 77.15   Median : 3.191   Median : 5.00  
##  Mean   :6.291   Mean   : 68.37   Mean   : 3.796   Mean   : 9.53  
##  3rd Qu.:6.630   3rd Qu.: 94.10   3rd Qu.: 5.215   3rd Qu.:24.00  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.00  
##       tax           ptratio         lstat             medv      
##  Min.   :187.0   Min.   :12.6   Min.   : 1.730   Min.   : 5.00  
##  1st Qu.:281.0   1st Qu.:16.9   1st Qu.: 7.043   1st Qu.:17.02  
##  Median :334.5   Median :18.9   Median :11.350   Median :21.20  
##  Mean   :409.5   Mean   :18.4   Mean   :12.631   Mean   :22.59  
##  3rd Qu.:666.0   3rd Qu.:20.2   3rd Qu.:16.930   3rd Qu.:25.00  
##  Max.   :711.0   Max.   :22.0   Max.   :37.970   Max.   :50.00  
##     target         
##  Length:466        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
skim(tr)
Data summary
Name tr
Number of rows 466
Number of columns 13
_______________________
Column type frequency:
character 1
numeric 12
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
target 0 1 1 2 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
zn 0 1 11.58 23.36 0.00 0.00 0.00 16.25 100.00 ▇▁▁▁▁
indus 0 1 11.11 6.85 0.46 5.15 9.69 18.10 27.74 ▇▆▁▇▁
chas 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
nox 0 1 0.55 0.12 0.39 0.45 0.54 0.62 0.87 ▇▇▅▃▁
rm 0 1 6.29 0.70 3.86 5.89 6.21 6.63 8.78 ▁▂▇▂▁
age 0 1 68.37 28.32 2.90 43.88 77.15 94.10 100.00 ▂▂▂▃▇
dis 0 1 3.80 2.11 1.13 2.10 3.19 5.21 12.13 ▇▅▂▁▁
rad 0 1 9.53 8.69 1.00 4.00 5.00 24.00 24.00 ▇▂▁▁▃
tax 0 1 409.50 167.90 187.00 281.00 334.50 666.00 711.00 ▇▇▅▁▇
ptratio 0 1 18.40 2.20 12.60 16.90 18.90 20.20 22.00 ▁▃▅▅▇
lstat 0 1 12.63 7.10 1.73 7.04 11.35 16.93 37.97 ▇▇▅▂▁
medv 0 1 22.59 9.24 5.00 17.02 21.20 25.00 50.00 ▂▇▅▁▁
correlation_table(data=tr, target="target")
## Warning in correlation_table(data = tr, target = "target"): NAs introduced by
## coercion
##    Variable target
## 1    target   1.00
## 2       nox   0.73
## 3       age   0.63
## 4       rad   0.63
## 5       tax   0.61
## 6     indus   0.60
## 7     lstat   0.47
## 8   ptratio   0.25
## 9        rm  -0.15
## 10     medv  -0.27
## 11       zn  -0.43
## 12      dis  -0.62

##Data Preparation

Since there are no null values we won’t need to worry about replacing them. We will change the datatype a few variables.

We will create a few variables based off off potential interactions form the given fields. First I will create a field called business that attempts to take the tax valuation of buildings and the proportion of non residential areas to identify suburbs with large retail centers.

The next variable we will create will be called apartment. This will attempt to take the property tax rates of homes and the average number of rooms per dwelling to identify areas with a high proportion of large homes.

We will create a variable called Pollution, I will take the proportion of non-retail business and multiply it by the nitrous oxide concentration to get an estimate on areas where the non-retail business acreage is occupied by polluting industries.

After reviewing the data we noticed that a large number of the zn records = 0. So we will create a dummy variable called zndum to show if the zn had >0 of a proportion of zoning for large lots.

Since the number of rooms can vary greatly between areas and is a continuous variable we will log transform that variable to smooth it out.

The rad index for accessibility to radial highways seems to have a lot of values under 10 and a lot of values at 24 so I will create a dummy variable called raddum to put these values into buckets.

In an attempt to find schools that have insufficient support and funding I will use the ptratio and lstat variables to determine areas with ladults with lower educaitonal attainment and low pupil to teacher ratios.

library(caTools)

tr$target <- as.factor(tr$target)
tr$cfas <- as.factor(tr$chas)

tr$business<-tr$tax*(1-tr$indus)
tr$apartment<-tr$rm/tr$tax
tr$pollution<-tr$nox*tr$indus
tr$zndum <- ifelse(tr$zn>0,1,0)
tr$rmlog<-log(tr$rm)
tr$raddum <- ifelse(tr$rad>23,1,0)
tr$lstatptratio<-((1-tr$lstat)*tr$ptratio)#/tr$rm

correlation_table(data=tr, target="target")
##        Variable target
## 1        target   1.00
## 2           nox   0.73
## 3     pollution   0.65
## 4           age   0.63
## 5           rad   0.63
## 6           tax   0.61
## 7         indus   0.60
## 8         lstat   0.47
## 9       ptratio   0.25
## 10           rm  -0.16
## 11        rmlog  -0.18
## 12         medv  -0.27
## 13           zn  -0.43
## 14 lstatptratio  -0.49
## 15    apartment  -0.56
## 16          dis  -0.62
## 17     business  -0.64
#https://stackoverflow.com/questions/17200114/how-to-split-data-into-training-testing-sets-using-sample-function

set.seed(2154)
## 75% of the sample size
smp_size <- floor(0.75 * nrow(tr))

## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(tr)), size = smp_size)

train <- tr[train_ind, ]
test <- tr[-train_ind, ]

Base model

Below is a model of all variable including those created to get a benchmark. We will then work backward and only select the most impactful variables

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
base <- glm(target~., data = train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(base)
## 
## Call:
## glm(formula = target ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8707  -0.0804  -0.0001   0.0001   4.3397  
## 
## Coefficients: (1 not defined because of singularities)
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.360e+02  7.992e+01   1.702 0.088715 .  
## zn            1.293e-02  1.180e-01   0.110 0.912802    
## indus        -6.650e-01  7.314e-01  -0.909 0.363274    
## chas         -6.164e-01  1.223e+00  -0.504 0.614235    
## nox           6.630e+01  2.410e+01   2.752 0.005930 ** 
## rm            2.727e+01  1.489e+01   1.831 0.067149 .  
## age           1.111e-02  1.919e-02   0.579 0.562577    
## dis           2.431e-01  5.275e-01   0.461 0.644866    
## rad           7.327e-01  3.114e-01   2.353 0.018614 *  
## tax          -2.843e-01  7.059e-02  -4.027 5.64e-05 ***
## ptratio      -1.143e-01  4.808e-01  -0.238 0.812168    
## lstat        -9.419e-01  7.022e-01  -1.341 0.179780    
## medv          1.620e-01  1.471e-01   1.101 0.270794    
## cfas1                NA         NA      NA       NA    
## business     -5.886e-03  2.008e-03  -2.931 0.003378 ** 
## apartment    -3.593e+03  1.031e+03  -3.485 0.000492 ***
## pollution    -2.085e+00  1.715e+00  -1.216 0.223997    
## zndum        -8.640e-01  3.037e+00  -0.285 0.776000    
## rmlog        -1.013e+02  8.706e+01  -1.164 0.244599    
## raddum        3.091e+01  1.466e+03   0.021 0.983177    
## lstatptratio -5.498e-02  3.791e-02  -1.450 0.146948    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 482.300  on 348  degrees of freedom
## Residual deviance:  89.478  on 329  degrees of freedom
## AIC: 129.48
## 
## Number of Fisher Scoring iterations: 19
back <- glm(target~nox+rm+rad+tax+ptratio+medv+business+apartment+pollution+rmlog, data = train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(back)
## 
## Call:
## glm(formula = target ~ nox + rm + rad + tax + ptratio + medv + 
##     business + apartment + pollution + rmlog, family = "binomial", 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6390  -0.0851  -0.0001   0.0870   4.1104  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.613e+02  7.707e+01   2.093  0.03633 *  
## nox          7.945e+01  1.639e+01   4.847 1.25e-06 ***
## rm           3.640e+01  1.519e+01   2.396  0.01658 *  
## rad          1.197e+00  2.157e-01   5.552 2.83e-08 ***
## tax         -2.460e-01  5.228e-02  -4.706 2.53e-06 ***
## ptratio      5.829e-01  1.871e-01   3.116  0.00183 ** 
## medv         4.876e-02  9.397e-02   0.519  0.60380    
## business    -5.902e-03  1.424e-03  -4.144 3.41e-05 ***
## apartment   -2.869e+03  6.204e+02  -4.625 3.75e-06 ***
## pollution   -3.205e+00  8.454e-01  -3.791  0.00015 ***
## rmlog       -1.712e+02  8.919e+01  -1.919  0.05499 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 482.30  on 348  degrees of freedom
## Residual deviance: 100.24  on 338  degrees of freedom
## AIC: 122.24
## 
## Number of Fisher Scoring iterations: 8
step <- glm(target~., data = train, family = "binomial") %>%
  stepAIC(trace = FALSE)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(step)
## 
## Call:
## glm(formula = target ~ nox + rm + rad + tax + lstat + business + 
##     apartment + pollution + rmlog + raddum + lstatptratio, family = "binomial", 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8094  -0.0819  -0.0001   0.0001   4.2885  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.843e+02  8.072e+01   2.283 0.022436 *  
## nox           7.625e+01  1.644e+01   4.637 3.54e-06 ***
## rm            3.885e+01  1.513e+01   2.568 0.010238 *  
## rad           8.595e-01  2.480e-01   3.466 0.000528 ***
## tax          -2.841e-01  6.266e-02  -4.534 5.80e-06 ***
## lstat        -7.155e-01  2.413e-01  -2.965 0.003026 ** 
## business     -5.901e-03  1.517e-03  -3.889 0.000101 ***
## apartment    -3.569e+03  9.549e+02  -3.737 0.000186 ***
## pollution    -3.219e+00  9.513e-01  -3.383 0.000716 ***
## rmlog        -1.692e+02  8.664e+01  -1.953 0.050804 .  
## raddum        2.785e+01  1.518e+03   0.018 0.985360    
## lstatptratio -4.072e-02  1.302e-02  -3.129 0.001757 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 482.300  on 348  degrees of freedom
## Residual deviance:  92.176  on 337  degrees of freedom
## AIC: 116.18
## 
## Number of Fisher Scoring iterations: 19

After working backwards we see the removal of a few variables does not lower with the residual deviance being much lower than the null deviance. We see that the stepwise AIC variable selection selects similar fields to those we selected while working backwards which has the lowest AIC of the 3 models.

##Validating our Best Model

test$business<-test$tax*(1-test$indus)
test$apartment<-test$rm/test$tax
test$pollution<-test$nox*test$indus
test$zndum <- ifelse(test$zn>0,1,0)
test$rmlog<-log(test$rm)
test$raddum <- ifelse(test$rad>23,1,0)
test$lstatptratio<-((1-test$lstat)*test$ptratio)#/test$rm

test$predictions<-predict(step, test, type="response")
test$predicted =  as.factor(ifelse(test$predictions >= 0.5, 1, 0))


library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
## The following object is masked from 'package:purrr':
## 
##     lift
confusionMatrix(test$predicted, test$target, positive = '1')
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(test$predicted, test$target, positive = "1"):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1 1}
##         0  50  1  0
##         1   1 65  0
##         1}  0  0  0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9829          
##                  95% CI : (0.9396, 0.9979)
##     No Information Rate : 0.5641          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9652          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 1}
## Sensitivity            0.9804   0.9848        NA
## Specificity            0.9848   0.9804         1
## Pos Pred Value         0.9804   0.9848        NA
## Neg Pred Value         0.9848   0.9804        NA
## Prevalence             0.4359   0.5641         0
## Detection Rate         0.4274   0.5556         0
## Detection Prevalence   0.4359   0.5641         0
## Balanced Accuracy      0.9826   0.9826        NA
proc = roc(test$target, test$predictions)
## Warning in roc.default(test$target, test$predictions): 'response' has more
## than two levels. Consider setting 'levels' explicitly or using 'multiclass.roc'
## instead
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(proc)

print(proc$auc)
## Area under the curve: 0.9997

We find the stepwise model to be 98.29% accurate, have a very low p-value and auc = 0.9997.

##Using Our Model to Predict the Evaluation Dataset

Our predictions for the evaluation dataset are now contained in the Predicted variable within the ev dataframe.

ev$business<-ev$tax*(1-ev$indus)
ev$apartment<-ev$rm/ev$tax
ev$pollution<-ev$nox*ev$indus
ev$zndum <- ifelse(ev$zn>0,1,0)
ev$rmlog<-log(ev$rm)
ev$raddum <- ifelse(ev$rad>23,1,0)
ev$lstatptratio<-((1-ev$lstat)*ev$ptratio)#/ev$rm

ev$predictions<-predict(step, ev, type="response")
ev$predicted =  as.factor(ifelse(ev$predictions >= 0.5, 1, 0))

write.csv(ev, file = "evaluation_predictions.csv")