DATA 621 – Business Analytics and Data Mining

Overview

Homework #3 Assignment Requirements
In this homework assignment, you will explore, analyze and model a data set containing information on crime for various neighborhoods of a major city. Each record has a response variable indicating whether the crime rate is above the median crime rate (1) or not (0).

Your objective is to build a binary logistic regression model on the training data set to predict whether the neighborhood will be at risk for high crime levels. You will provide classifications and probabilities for the evaluation data set using your binary logistic regression model. You can only use the variables given to you (or variables that you derive from the variables provided). Below is a short description of the variables of interest in the data set.

# Read the evaluation and training sets straight from the course repository.
crime_eval_df <- read.csv(
  file = "https://raw.githubusercontent.com/johnm1990/DATA621/main/hw3/crime-evaluation-data_modified.csv"
)
crime_train_df <- read.csv(
  file = "https://raw.githubusercontent.com/johnm1990/DATA621/main/hw3/crime-training-data_modified.csv"
)

1. Data Exploration

# Show the first training record to confirm the column layout.
head(crime_train_df, n = 1L)

##   zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv target
## 1  0 19.58    0 0.605 7.929 96.2 2.0459   5 403    14.7   3.7   50      1

### Summary statistics: min, quartiles, median, mean and max of every column.
print(summary(crime_train_df))

##        zn             indus             chas              nox        
##  Min.   :  0.00   Min.   : 0.460   Min.   :0.00000   Min.   :0.3890  
##  1st Qu.:  0.00   1st Qu.: 5.145   1st Qu.:0.00000   1st Qu.:0.4480  
##  Median :  0.00   Median : 9.690   Median :0.00000   Median :0.5380  
##  Mean   : 11.58   Mean   :11.105   Mean   :0.07082   Mean   :0.5543  
##  3rd Qu.: 16.25   3rd Qu.:18.100   3rd Qu.:0.00000   3rd Qu.:0.6240  
##  Max.   :100.00   Max.   :27.740   Max.   :1.00000   Max.   :0.8710  
##        rm             age              dis              rad       
##  Min.   :3.863   Min.   :  2.90   Min.   : 1.130   Min.   : 1.00  
##  1st Qu.:5.887   1st Qu.: 43.88   1st Qu.: 2.101   1st Qu.: 4.00  
##  Median :6.210   Median : 77.15   Median : 3.191   Median : 5.00  
##  Mean   :6.291   Mean   : 68.37   Mean   : 3.796   Mean   : 9.53  
##  3rd Qu.:6.630   3rd Qu.: 94.10   3rd Qu.: 5.215   3rd Qu.:24.00  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.00  
##       tax           ptratio         lstat             medv      
##  Min.   :187.0   Min.   :12.6   Min.   : 1.730   Min.   : 5.00  
##  1st Qu.:281.0   1st Qu.:16.9   1st Qu.: 7.043   1st Qu.:17.02  
##  Median :334.5   Median :18.9   Median :11.350   Median :21.20  
##  Mean   :409.5   Mean   :18.4   Mean   :12.631   Mean   :22.59  
##  3rd Qu.:666.0   3rd Qu.:20.2   3rd Qu.:16.930   3rd Qu.:25.00  
##  Max.   :711.0   Max.   :22.0   Max.   :37.970   Max.   :50.00  
##      target      
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.4914  
##  3rd Qu.:1.0000  
##  Max.   :1.0000

# Standard deviation of each column, returned as a named numeric vector.
vapply(crime_train_df, sd, FUN.VALUE = numeric(1), na.rm = TRUE)

##          zn       indus        chas         nox          rm         age 
##  23.3646511   6.8458549   0.2567920   0.1166667   0.7048513  28.3213784 
##         dis         rad         tax     ptratio       lstat        medv 
##   2.1069496   8.6859272 167.9000887   2.1968447   7.1018907   9.2396814 
##      target 
##   0.5004636

# Draw one histogram per column. Iterating over the column names lets each
# plot carry the variable name in its title and x-axis label (iterating over
# the columns themselves labels every plot "X[[i]]"). No na.rm argument is
# passed: hist() does not define one, so it would only be forwarded to the
# plotting code. The matrix of histogram components is still returned and
# printed, one column per variable.
sapply(names(crime_train_df), function(col_name) {
  hist(
    crime_train_df[[col_name]],
    main = paste("Histogram of", col_name),
    xlab = col_name
  )
})

##          zn         indus      chas       nox        rm         age       
## breaks   Numeric,11 Numeric,15 Numeric,11 Numeric,12 Numeric,12 Numeric,11
## counts   Integer,10 Integer,14 Integer,10 Integer,11 Integer,11 Integer,10
## density  Numeric,10 Numeric,14 Numeric,10 Numeric,11 Numeric,11 Numeric,10
## mids     Numeric,10 Numeric,14 Numeric,10 Numeric,11 Numeric,11 Numeric,10
## xname    "X[[i]]"   "X[[i]]"   "X[[i]]"   "X[[i]]"   "X[[i]]"   "X[[i]]"  
## equidist TRUE       TRUE       TRUE       TRUE       TRUE       TRUE      
##          dis        rad        tax        ptratio    lstat     medv      
## breaks   Integer,13 Numeric,13 Integer,13 Integer,11 Numeric,9 Integer,10
## counts   Integer,12 Integer,12 Integer,12 Integer,10 Integer,8 Integer,9 
## density  Numeric,12 Numeric,12 Numeric,12 Numeric,10 Numeric,8 Numeric,9 
## mids     Numeric,12 Numeric,12 Numeric,12 Numeric,10 Numeric,8 Numeric,9 
## xname    "X[[i]]"   "X[[i]]"   "X[[i]]"   "X[[i]]"   "X[[i]]"  "X[[i]]"  
## equidist TRUE       TRUE       TRUE       TRUE       TRUE      TRUE      
##          target    
## breaks   Numeric,11
## counts   Integer,10
## density  Numeric,10
## mids     Numeric,10
## xname    "X[[i]]"  
## equidist TRUE

## Correlation matrix with pairwise p-values; rcorr() is given the data as a
## numeric matrix.
crime_train_df.rcorr <- rcorr(as.matrix(crime_train_df))
print(crime_train_df.rcorr)

##            zn indus  chas   nox    rm   age   dis   rad   tax ptratio lstat
## zn       1.00 -0.54 -0.04 -0.52  0.32 -0.57  0.66 -0.32 -0.32   -0.39 -0.43
## indus   -0.54  1.00  0.06  0.76 -0.39  0.64 -0.70  0.60  0.73    0.39  0.61
## chas    -0.04  0.06  1.00  0.10  0.09  0.08 -0.10 -0.02 -0.05   -0.13 -0.05
## nox     -0.52  0.76  0.10  1.00 -0.30  0.74 -0.77  0.60  0.65    0.18  0.60
## rm       0.32 -0.39  0.09 -0.30  1.00 -0.23  0.20 -0.21 -0.30   -0.36 -0.63
## age     -0.57  0.64  0.08  0.74 -0.23  1.00 -0.75  0.46  0.51    0.26  0.61
## dis      0.66 -0.70 -0.10 -0.77  0.20 -0.75  1.00 -0.49 -0.53   -0.23 -0.51
## rad     -0.32  0.60 -0.02  0.60 -0.21  0.46 -0.49  1.00  0.91    0.47  0.50
## tax     -0.32  0.73 -0.05  0.65 -0.30  0.51 -0.53  0.91  1.00    0.47  0.56
## ptratio -0.39  0.39 -0.13  0.18 -0.36  0.26 -0.23  0.47  0.47    1.00  0.38
## lstat   -0.43  0.61 -0.05  0.60 -0.63  0.61 -0.51  0.50  0.56    0.38  1.00
## medv     0.38 -0.50  0.16 -0.43  0.71 -0.38  0.26 -0.40 -0.49   -0.52 -0.74
## target  -0.43  0.60  0.08  0.73 -0.15  0.63 -0.62  0.63  0.61    0.25  0.47
##          medv target
## zn       0.38  -0.43
## indus   -0.50   0.60
## chas     0.16   0.08
## nox     -0.43   0.73
## rm       0.71  -0.15
## age     -0.38   0.63
## dis      0.26  -0.62
## rad     -0.40   0.63
## tax     -0.49   0.61
## ptratio -0.52   0.25
## lstat   -0.74   0.47
## medv     1.00  -0.27
## target  -0.27   1.00
## 
## n= 466 
## 
## 
## P
##         zn     indus  chas   nox    rm     age    dis    rad    tax    ptratio
## zn             0.0000 0.3870 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
## indus   0.0000        0.1874 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
## chas    0.3870 0.1874        0.0355 0.0509 0.0890 0.0372 0.7321 0.3138 0.0054 
## nox     0.0000 0.0000 0.0355        0.0000 0.0000 0.0000 0.0000 0.0000 0.0001 
## rm      0.0000 0.0000 0.0509 0.0000        0.0000 0.0000 0.0000 0.0000 0.0000 
## age     0.0000 0.0000 0.0890 0.0000 0.0000        0.0000 0.0000 0.0000 0.0000 
## dis     0.0000 0.0000 0.0372 0.0000 0.0000 0.0000        0.0000 0.0000 0.0000 
## rad     0.0000 0.0000 0.7321 0.0000 0.0000 0.0000 0.0000        0.0000 0.0000 
## tax     0.0000 0.0000 0.3138 0.0000 0.0000 0.0000 0.0000 0.0000        0.0000 
## ptratio 0.0000 0.0000 0.0054 0.0001 0.0000 0.0000 0.0000 0.0000 0.0000        
## lstat   0.0000 0.0000 0.2679 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
## medv    0.0000 0.0000 0.0005 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 
## target  0.0000 0.0000 0.0843 0.0000 0.0010 0.0000 0.0000 0.0000 0.0000 0.0000 
##         lstat  medv   target
## zn      0.0000 0.0000 0.0000
## indus   0.0000 0.0000 0.0000
## chas    0.2679 0.0005 0.0843
## nox     0.0000 0.0000 0.0000
## rm      0.0000 0.0000 0.0010
## age     0.0000 0.0000 0.0000
## dis     0.0000 0.0000 0.0000
## rad     0.0000 0.0000 0.0000
## tax     0.0000 0.0000 0.0000
## ptratio 0.0000 0.0000 0.0000
## lstat          0.0000 0.0000
## medv    0.0000        0.0000
## target  0.0000 0.0000

# Correlation plot: compute the correlation matrix of all columns once and
# draw it once.
crime_train_df.cor <- cor(crime_train_df)
corrplot(crime_train_df.cor)

# Test whether the correlation between dis and target differs from zero.
cor_distarget <- cor.test(x = crime_train_df$dis, y = crime_train_df$target)
print(cor_distarget)

## 
##  Pearson's product-moment correlation
## 
## data:  crime_train_df$dis and crime_train_df$target
## t = -16.963, df = 464, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6717579 -0.5592666
## sample estimates:
##        cor 
## -0.6186731

# Test whether the correlation between nox and target differs from zero.
cor_noxtarget <- cor.test(x = crime_train_df$nox, y = crime_train_df$target)
print(cor_noxtarget)

## 
##  Pearson's product-moment correlation
## 
## data:  crime_train_df$nox and crime_train_df$target
## t = 22.748, df = 464, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6801291 0.7663936
## sample estimates:
##       cor 
## 0.7261062

# Test the correlation between the two predictors tax and rad (r = 0.91 in
# the correlation matrix), a potential collinearity concern.
cor_taxrad <- cor.test(x = crime_train_df$tax, y = crime_train_df$rad)
print(cor_taxrad)

## 
##  Pearson's product-moment correlation
## 
## data:  crime_train_df$tax and crime_train_df$rad
## t = 46.239, df = 464, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8888115 0.9214292
## sample estimates:
##       cor 
## 0.9064632

# t-tests comparing each predictor's mean between target = 0 and target = 1,
# possibly useful later for choosing bucket boundaries.
Ttest_targetpt <- t.test(crime_train_df$ptratio ~ crime_train_df$target)
print(Ttest_targetpt)

## 
##  Welch Two Sample t-test
## 
## data:  crime_train_df$ptratio by crime_train_df$target
## t = -5.5567, df = 426.42, p-value = 4.852e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.4906314 -0.7116312
## sample estimates:
## mean in group 0 mean in group 1 
##        17.85738        18.95852

# Difference in mean nox between the two target classes.
Ttest_targetnox <- t.test(crime_train_df$nox ~ crime_train_df$target)
print(Ttest_targetnox)

## 
##  Welch Two Sample t-test
## 
## data:  crime_train_df$nox by crime_train_df$target
## t = -22.545, df = 356.06, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1840337 -0.1545020
## sample estimates:
## mean in group 0 mean in group 1 
##       0.4711295       0.6403974

# Difference in mean ptratio between the two target classes.
# NOTE(review): this repeats the ptratio test already run at the start of the
# t-test series; possibly a different predictor was intended -- confirm.
Ttest_targetpt <- t.test(crime_train_df$ptratio ~ crime_train_df$target)
print(Ttest_targetpt)

## 
##  Welch Two Sample t-test
## 
## data:  crime_train_df$ptratio by crime_train_df$target
## t = -5.5567, df = 426.42, p-value = 4.852e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.4906314 -0.7116312
## sample estimates:
## mean in group 0 mean in group 1 
##        17.85738        18.95852

# Difference in mean lstat between the two target classes.
Ttest_targetlstat <- t.test(crime_train_df$lstat ~ crime_train_df$target)
print(Ttest_targetlstat)

## 
##  Welch Two Sample t-test
## 
## data:  crime_train_df$lstat by crime_train_df$target
## t = -11.364, df = 391.47, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.808908 -5.505503
## sample estimates:
## mean in group 0 mean in group 1 
##         9.36000        16.01721

2. Data Preparation

# From here on the response is handled as a two-level factor.
crime_train_df$target <- as.factor(crime_train_df$target)

# zn: distribution within each target class, then overall.
boxplot(crime_train_df$zn ~ crime_train_df$target)

hist(crime_train_df$zn)

# Every zn value below 10 is exactly 0.
table(crime_train_df$zn[crime_train_df$zn < 10])

## 
##   0 
## 339

# Shape of the non-zero part of zn.
hist(crime_train_df$zn[crime_train_df$zn > 0])

# All rows with zn above 22 belong to target = 0.
table(crime_train_df$target[crime_train_df$zn > 22])

## 
##  0  1 
## 81  0

# Indicator for the zn > 22 rows.
crime_train_df$zn_hi <- crime_train_df$zn > 22
# indus: distribution within each target class, then overall.
boxplot(crime_train_df$indus ~ crime_train_df$target)

hist(crime_train_df$indus)

# Only five distinct values occur above 18.
table(crime_train_df$indus[crime_train_df$indus > 18])

## 
##  18.1 19.58 21.89 25.65 27.74 
##   121    28    14     5     5

# Rows with 18 < indus < 20 are all target = 1 ...
indus_18_to_20 <- crime_train_df$indus > 18 & crime_train_df$indus < 20
table(crime_train_df$target[indus_18_to_20])

## 
##   0   1 
##   0 149

# ... whereas rows with indus > 20 are mixed.
table(crime_train_df$target[crime_train_df$indus > 20])

## 
##  0  1 
## 11 13

# Dummy indicator for 18 < indus < 20.

crime_train_df$indus19 <- indus_18_to_20
 

# nox, rm and age: distribution within each target class.
boxplot(crime_train_df$nox ~ crime_train_df$target)

boxplot(crime_train_df$rm ~ crime_train_df$target)

boxplot(crime_train_df$age ~ crime_train_df$target)

# age among the lower-crime neighborhoods (target = 0).
hist(crime_train_df$age[crime_train_df$target == 0])

# That looks normal for the lower-crime areas. How about the higher ones?

hist(crime_train_df$age[crime_train_df$target == 1])

# Squared age term.
crime_train_df$sq_age <- crime_train_df$age^2
# dis: distribution within each target class.
boxplot(crime_train_df$dis ~ crime_train_df$target)

# Spread of dis within each class (target = 1 first, then target = 0).
sd(crime_train_df$dis[crime_train_df$target == 1])

## [1] 1.078999

sd(crime_train_df$dis[crime_train_df$target == 0])

## [1] 2.067398

# log(dis): overall, then separately for each class.
hist(log(crime_train_df$dis))

hist(log(crime_train_df$dis[crime_train_df$target == 0]))

hist(log(crime_train_df$dis[crime_train_df$target == 1]))

# Taking the log normalizes the predictor conditioned on the response, so
# log_dis should be useful.

crime_train_df$log_dis <- log(crime_train_df$dis)
# rad: distribution within each target class.
boxplot(crime_train_df$rad ~ crime_train_df$target)

# Spread of rad within each class (target = 1 first, then target = 0).
sd(crime_train_df$rad[crime_train_df$target == 1])

## [1] 9.514932

sd(crime_train_df$rad[crime_train_df$target == 0])

## [1] 1.594359

hist(crime_train_df$rad)

# Every row with rad above 15 is target = 1.
table(crime_train_df$target[crime_train_df$rad > 15])

## 
##   0   1 
##   0 121

# Share of the rad > 8 rows that are already flagged by indus19.
mean(crime_train_df$indus19[crime_train_df$rad > 8])

## [1] 1

# Yes, that's just redundancy. What about at the lower end of the range?

mean(crime_train_df$indus19[crime_train_df$rad < 5])

## [1] 0

# Indicator for rad strictly between 5 and 8.
# NOTE(review): both bounds are exclusive, so rad == 5 and rad == 8 are not
# flagged even though the column is named "5to8" -- confirm this is intended.
crime_train_df$rad5to8 <- crime_train_df$rad > 5 & crime_train_df$rad < 8

# tax: distribution within each target class.
boxplot(crime_train_df$tax ~ crime_train_df$target)

# Spread of tax within each class (target = 1 first, then target = 0).
sd(crime_train_df$tax[crime_train_df$target == 1])

## [1] 166.6934

sd(crime_train_df$tax[crime_train_df$target == 0])

## [1] 89.19775

# Very different variances.

hist(crime_train_df$tax)

crime_train_df$log_tax <- log(crime_train_df$tax)

# Only two distinct tax values occur above 600.
table(crime_train_df$tax[crime_train_df$tax > 600])

## 
## 666 711 
## 121   5

# That's just weird, why 666?

is_tax_666 <- crime_train_df$tax == 666
table(crime_train_df$target[is_tax_666])

## 
##   0   1 
##   0 121

# That's the same table as rad > 15. More redundancy probably.

sum(is_tax_666 & crime_train_df$rad > 15)

## [1] 121

# What about the tax == 711 rows?


is_tax_711 <- crime_train_df$tax == 711
table(crime_train_df$target[is_tax_711])

## 
## 0 1 
## 5 0

# Will indus19 take care of those 5, so that we can ignore tax > 600?

sum(is_tax_711 & crime_train_df$indus19)

## [1] 0

table(crime_train_df$target[is_tax_711 & !crime_train_df$indus19])

## 
## 0 1 
## 5 0

# Indicator for the tax == 666 rows.
crime_train_df$tax_666 <- is_tax_666


# ptratio: distribution within each target class, then within target = 1.
boxplot(crime_train_df$ptratio ~ crime_train_df$target)

hist(crime_train_df$ptratio[crime_train_df$target == 1])

# Frequencies of the ptratio values above 19; 20.2 dominates.
table(crime_train_df$ptratio[crime_train_df$ptratio > 19])

## 
## 19.1 19.2 19.6 19.7 20.1 20.2 20.9   21 21.1 21.2   22 
##   14   17    6    6    5  128   11   23    1   14    2

# Rows at the 20.2 peak are almost all target = 1.
at_pt_peak <- crime_train_df$ptratio == 20.2
table(crime_train_df$target[at_pt_peak])

## 
##   0   1 
##   7 121

# Indicator for the 20.2 peak, plus a log-transformed ptratio.
crime_train_df$pt_peak <- at_pt_peak
crime_train_df$log_ptrat <- log(crime_train_df$ptratio)


# lstat: distribution within each target class, then overall.
boxplot(crime_train_df$lstat ~ crime_train_df$target)

hist(crime_train_df$lstat)

# Log-transformed lstat.
crime_train_df$log_lstat <- log(crime_train_df$lstat)

# medv: distribution within each target class.
boxplot(crime_train_df$medv ~ crime_train_df$target)

3. Build the Models

# Reproducible 90/10 split of the prepared training data into a fitting set
# and a hold-out validation set, partitioned on the response.
set.seed(123)
split <- caret::createDataPartition(
  y = crime_train_df$target,
  p = 0.90,
  list = FALSE
)
train <- crime_train_df[split, ]
validation <- crime_train_df[-split, ]

# Step 1: Create a full model using every column of `train` as a predictor.
# NOTE(review): several of the derived indicators line up perfectly with one
# target class in the tables above (e.g. zn > 22, tax == 666), which is
# presumably why their standard errors in the summary are in the thousands --
# confirm whether they should stay in the model.
model.full <- glm(formula = target ~ ., family = "binomial", data = train)
print(summary(model.full))

## 
## Call:
## glm(formula = target ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.11946  -0.00003   0.00000   0.00000   2.17497  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.559e+03  4.252e+02  -3.666 0.000246 ***
## zn           9.557e-01  2.506e-01   3.814 0.000137 ***
## indus        1.914e-01  2.294e-01   0.835 0.403999    
## chas        -1.684e-01  1.447e+00  -0.116 0.907381    
## nox          1.660e+02  4.040e+01   4.108 3.99e-05 ***
## rm          -5.077e+00  2.018e+00  -2.516 0.011865 *  
## age          4.220e-02  7.123e-02   0.592 0.553557    
## dis         -3.897e+00  2.215e+00  -1.760 0.078442 .  
## rad          3.748e+00  1.015e+00   3.692 0.000222 ***
## tax         -5.897e-01  1.864e-01  -3.163 0.001560 ** 
## ptratio     -1.380e+01  5.227e+00  -2.639 0.008314 ** 
## lstat        7.078e-01  2.648e-01   2.673 0.007512 ** 
## medv         7.024e-01  2.237e-01   3.140 0.001690 ** 
## zn_hiTRUE   -7.762e+01  1.994e+03  -0.039 0.968942    
## indus19TRUE  3.594e+01  6.221e+03   0.006 0.995390    
## sq_age      -6.331e-06  5.901e-04  -0.011 0.991440    
## log_dis      1.451e+01  7.928e+00   1.831 0.067143 .  
## rad5to8TRUE -1.107e+01  3.085e+00  -3.589 0.000332 ***
## log_tax      1.691e+02  5.705e+01   2.964 0.003034 ** 
## tax_666TRUE  3.013e+00  1.987e+04   0.000 0.999879    
## pt_peakTRUE -1.941e+01  1.873e+04  -0.001 0.999173    
## log_ptrat    3.216e+02  1.052e+02   3.058 0.002228 ** 
## log_lstat   -7.259e+00  3.263e+00  -2.225 0.026096 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 583.514  on 420  degrees of freedom
## Residual deviance:  71.971  on 398  degrees of freedom
## AIC: 117.97
## 
## Number of Fisher Scoring iterations: 21

# Step 2: Backward elimination by AIC, starting from the full model.
model.backward <- stepAIC(model.full, direction = "backward", trace = FALSE)
print(summary(model.backward))

## 
## Call:
## glm(formula = target ~ zn + nox + rm + age + dis + rad + tax + 
##     ptratio + lstat + medv + zn_hi + indus19 + log_dis + rad5to8 + 
##     log_tax + log_ptrat + log_lstat, family = "binomial", data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.19148  -0.00005   0.00000   0.00000   2.07352  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.604e+03  3.697e+02  -4.340 1.42e-05 ***
## zn           9.403e-01  2.148e-01   4.378 1.20e-05 ***
## nox          1.711e+02  3.774e+01   4.534 5.77e-06 ***
## rm          -4.999e+00  1.963e+00  -2.546 0.010894 *  
## age          4.196e-02  2.359e-02   1.779 0.075300 .  
## dis         -3.081e+00  1.464e+00  -2.105 0.035307 *  
## rad          3.456e+00  9.410e-01   3.673 0.000240 ***
## tax         -5.548e-01  1.316e-01  -4.216 2.49e-05 ***
## ptratio     -1.647e+01  4.814e+00  -3.422 0.000621 ***
## lstat        7.394e-01  2.359e-01   3.134 0.001722 ** 
## medv         6.963e-01  2.184e-01   3.188 0.001434 ** 
## zn_hiTRUE   -7.767e+01  3.314e+03  -0.023 0.981299    
## indus19TRUE  3.451e+01  2.588e+03   0.013 0.989359    
## log_dis      1.143e+01  5.903e+00   1.935 0.052931 .  
## rad5to8TRUE -1.170e+01  2.881e+00  -4.062 4.87e-05 ***
## log_tax      1.612e+02  3.811e+01   4.229 2.35e-05 ***
## log_ptrat    3.668e+02  9.921e+01   3.697 0.000218 ***
## log_lstat   -7.670e+00  3.035e+00  -2.527 0.011490 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 583.514  on 420  degrees of freedom
## Residual deviance:  73.737  on 403  degrees of freedom
## AIC: 109.74
## 
## Number of Fisher Scoring iterations: 22

# Show the formula retained by the backward selection.
print(formula(model.backward))

## target ~ zn + nox + rm + age + dis + rad + tax + ptratio + lstat + 
##     medv + zn_hi + indus19 + log_dis + rad5to8 + log_tax + log_ptrat + 
##     log_lstat

4. Select Models

# Generate predictions for the hold-out set on the probability scale.
# predict() on a glm returns the linear predictor (log-odds) unless
# type = "response" is requested, and the 0.5 cutoff below is only a
# "50% chance" boundary for probabilities.
model.backward.prob <- predict(
  model.backward,
  newdata = validation,
  type = "response"
)

# Classify at 0.5. Fixing the levels keeps both classes in the factor even
# when only one of them is ever predicted, so it always matches the levels of
# validation$target.
model.backward.pred <- factor(
  as.integer(model.backward.prob >= 0.5),
  levels = c(0L, 1L)
)


# Generate the confusion matrix for the hold-out set.
# NOTE(review): the printed result reports class "0" as the 'Positive' class;
# pass positive = "1" if "high crime" should be treated as the positive class.
model.backward.confusion.matrix <- confusionMatrix(model.backward.pred, validation$target, mode = "everything")
model.backward.confusion.matrix

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 23  1
##          1  0 21
##                                           
##                Accuracy : 0.9778          
##                  95% CI : (0.8823, 0.9994)
##     No Information Rate : 0.5111          
##     P-Value [Acc > NIR] : 3.366e-12       
##                                           
##                   Kappa : 0.9555          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9545          
##          Pos Pred Value : 0.9583          
##          Neg Pred Value : 1.0000          
##               Precision : 0.9583          
##                  Recall : 1.0000          
##                      F1 : 0.9787          
##              Prevalence : 0.5111          
##          Detection Rate : 0.5111          
##    Detection Prevalence : 0.5333          
##       Balanced Accuracy : 0.9773          
##                                           
##        'Positive' Class : 0               
##

# Leverage vs. standardized deviance residuals for the backward-selected model.
par(mfrow = c(1, 1))
hvalues <- influence(model.backward)$hat
stanresDeviance <- residuals(model.backward) / sqrt(1 - hvalues)
plot(
  hvalues, stanresDeviance,
  ylab = "Standardized Deviance Residuals",
  xlab = "Leverage Values",
  ylim = c(-3, 3),
  xlim = c(-0.05, 0.7)
)
# High-leverage reference line at 2 * (p + 1) / n, where p + 1 is the number
# of estimated coefficients and n the number of observations in the fit.
# (length() of a data frame is its column count, not its row count, and the
# number of coefficients is taken from the model rather than hard-coded.)
abline(v = 2 * length(coef(model.backward)) / nobs(model.backward), lty = 2)
# Interactive point labelling; returns integer(0) when no points are picked.
identify(hvalues, stanresDeviance, cex = 0.75)

## integer(0)

Refit the final backward-selected model on the full training data set to check its fit.

# Same terms as the backward-selected model, fitted on every row of
# crime_train_df instead of the 90% split.
fullFit <- glm(
  target ~ zn + nox + rm + age + dis + rad + tax + ptratio + lstat + medv +
    zn_hi + indus19 + log_dis + rad5to8 + log_tax + log_ptrat + log_lstat,
  family = "binomial",
  data = crime_train_df
)
print(summary(fullFit))

## 
## Call:
## glm(formula = target ~ zn + nox + rm + age + dis + rad + tax + 
##     ptratio + lstat + medv + zn_hi + indus19 + log_dis + rad5to8 + 
##     log_tax + log_ptrat + log_lstat, family = "binomial", data = crime_train_df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.17788  -0.00004   0.00000   0.00000   2.19575  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.601e+03  3.539e+02  -4.523 6.10e-06 ***
## zn           9.397e-01  2.107e-01   4.459 8.24e-06 ***
## nox          1.713e+02  3.587e+01   4.775 1.80e-06 ***
## rm          -4.703e+00  1.892e+00  -2.485 0.012938 *  
## age          3.921e-02  2.227e-02   1.760 0.078360 .  
## dis         -2.872e+00  1.409e+00  -2.038 0.041557 *  
## rad          3.461e+00  9.324e-01   3.712 0.000205 ***
## tax         -5.718e-01  1.338e-01  -4.275 1.91e-05 ***
## ptratio     -1.578e+01  4.337e+00  -3.638 0.000275 ***
## lstat        6.780e-01  2.180e-01   3.109 0.001874 ** 
## medv         7.165e-01  2.135e-01   3.356 0.000792 ***
## zn_hiTRUE   -7.726e+01  3.090e+03  -0.025 0.980049    
## indus19TRUE  3.426e+01  2.515e+03   0.014 0.989129    
## log_dis      1.018e+01  5.605e+00   1.816 0.069391 .  
## rad5to8TRUE -1.159e+01  2.786e+00  -4.160 3.18e-05 ***
## log_tax      1.656e+02  3.869e+01   4.279 1.87e-05 ***
## log_ptrat    3.531e+02  8.955e+01   3.943 8.05e-05 ***
## log_lstat   -6.483e+00  2.809e+00  -2.308 0.021018 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.876  on 465  degrees of freedom
## Residual deviance:  78.236  on 448  degrees of freedom
## AIC: 114.24
## 
## Number of Fisher Scoring iterations: 22

# AUC of model.backward, computed from its fitted probabilities against the
# responses stored in the model (model.backward$y), i.e. an in-sample figure.
p <- predict(model.backward, type = "response")
roc_pred <- prediction(predictions = p, labels = model.backward$y)

auc.tmp <- performance(roc_pred, measure = "auc")
auc <- as.numeric(auc.tmp@y.values)
auc

## [1] 0.9950336

# Plot the ROC curve (true-positive rate against false-positive rate),
# coloured by cutoff and annotated with cutoffs every 0.05.
roc_perf <- performance(roc_pred, measure = "tpr", x.measure = "fpr")
plot(
  roc_perf,
  colorize = TRUE,
  print.cutoffs.at = seq(0, 1, by = 0.05),
  text.adj = c(-0.2, 1.7)
)

Homework #3 Binary logistic regression models

Douglas Barley, Ethan Haley, Isabel Magnus, John Mazon, Vinayak Kamath, Arushi Arora

11/1/2021

Overview

1. Data Exploration

2. Data Preparation

3. Build the Models

4. Select Models