DATA 621 – Business Analytics and Data Mining
Homework #3 Assignment Requirements
In this homework assignment, you will explore, analyze and model a data set containing information on crime for various neighborhoods of a major city. Each record has a response variable indicating whether the crime rate is above the median crime rate (1) or not (0).
Your objective is to build a binary logistic regression model on the training data set to predict whether the neighborhood will be at risk for high crime levels. You will provide classifications and probabilities for the evaluation data set using your binary logistic regression model. You can only use the variables given to you (or variables that you derive from the variables provided). Below is a short description of the variables of interest in the data set:
# Load the evaluation and training sets from the course repository.
crime_eval_df <- read.csv("https://raw.githubusercontent.com/johnm1990/DATA621/main/hw3/crime-evaluation-data_modified.csv")
crime_train_df <- read.csv("https://raw.githubusercontent.com/johnm1990/DATA621/main/hw3/crime-training-data_modified.csv")
# Peek at the first training row to confirm the columns loaded as expected.
head(crime_train_df, 1)
## zn indus chas nox rm age dis rad tax ptratio lstat medv target
## 1 0 19.58 0 0.605 7.929 96.2 2.0459 5 403 14.7 3.7 50 1
# Summary statistics: five-number summary and mean for every training column
summary(crime_train_df)
## zn indus chas nox
## Min. : 0.00 Min. : 0.460 Min. :0.00000 Min. :0.3890
## 1st Qu.: 0.00 1st Qu.: 5.145 1st Qu.:0.00000 1st Qu.:0.4480
## Median : 0.00 Median : 9.690 Median :0.00000 Median :0.5380
## Mean : 11.58 Mean :11.105 Mean :0.07082 Mean :0.5543
## 3rd Qu.: 16.25 3rd Qu.:18.100 3rd Qu.:0.00000 3rd Qu.:0.6240
## Max. :100.00 Max. :27.740 Max. :1.00000 Max. :0.8710
## rm age dis rad
## Min. :3.863 Min. : 2.90 Min. : 1.130 Min. : 1.00
## 1st Qu.:5.887 1st Qu.: 43.88 1st Qu.: 2.101 1st Qu.: 4.00
## Median :6.210 Median : 77.15 Median : 3.191 Median : 5.00
## Mean :6.291 Mean : 68.37 Mean : 3.796 Mean : 9.53
## 3rd Qu.:6.630 3rd Qu.: 94.10 3rd Qu.: 5.215 3rd Qu.:24.00
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.00
## tax ptratio lstat medv
## Min. :187.0 Min. :12.6 Min. : 1.730 Min. : 5.00
## 1st Qu.:281.0 1st Qu.:16.9 1st Qu.: 7.043 1st Qu.:17.02
## Median :334.5 Median :18.9 Median :11.350 Median :21.20
## Mean :409.5 Mean :18.4 Mean :12.631 Mean :22.59
## 3rd Qu.:666.0 3rd Qu.:20.2 3rd Qu.:16.930 3rd Qu.:25.00
## Max. :711.0 Max. :22.0 Max. :37.970 Max. :50.00
## target
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4914
## 3rd Qu.:1.0000
## Max. :1.0000
# Standard deviation of every column, returned as a named numeric vector.
vapply(crime_train_df, sd, numeric(1), na.rm = TRUE)
## zn indus chas nox rm age
## 23.3646511 6.8458549 0.2567920 0.1166667 0.7048513 28.3213784
## dis rad tax ptratio lstat medv
## 2.1069496 8.6859272 167.9000887 2.1968447 7.1018907 9.2396814
## target
## 0.5004636
# Draw one histogram per column, labelled with the column name.
# hist() has no na.rm argument (it already drops non-finite values), and a
# plain loop avoids printing the matrix of histogram objects that sapply()
# would return.
for (col_name in names(crime_train_df)) {
  hist(
    crime_train_df[[col_name]],
    main = paste("Histogram of", col_name),
    xlab = col_name
  )
}
## zn indus chas nox rm age
## breaks Numeric,11 Numeric,15 Numeric,11 Numeric,12 Numeric,12 Numeric,11
## counts Integer,10 Integer,14 Integer,10 Integer,11 Integer,11 Integer,10
## density Numeric,10 Numeric,14 Numeric,10 Numeric,11 Numeric,11 Numeric,10
## mids Numeric,10 Numeric,14 Numeric,10 Numeric,11 Numeric,11 Numeric,10
## xname "X[[i]]" "X[[i]]" "X[[i]]" "X[[i]]" "X[[i]]" "X[[i]]"
## equidist TRUE TRUE TRUE TRUE TRUE TRUE
## dis rad tax ptratio lstat medv
## breaks Integer,13 Numeric,13 Integer,13 Integer,11 Numeric,9 Integer,10
## counts Integer,12 Integer,12 Integer,12 Integer,10 Integer,8 Integer,9
## density Numeric,12 Numeric,12 Numeric,12 Numeric,10 Numeric,8 Numeric,9
## mids Numeric,12 Numeric,12 Numeric,12 Numeric,10 Numeric,8 Numeric,9
## xname "X[[i]]" "X[[i]]" "X[[i]]" "X[[i]]" "X[[i]]" "X[[i]]"
## equidist TRUE TRUE TRUE TRUE TRUE TRUE
## target
## breaks Numeric,11
## counts Integer,10
## density Numeric,10
## mids Numeric,10
## xname "X[[i]]"
## equidist TRUE
# Correlation matrix
# rcorr() works on a numeric matrix and returns the pairwise correlations,
# the number of observations, and the matching p-values.
crime_train_df.rcorr <- rcorr(as.matrix(crime_train_df))
crime_train_df.rcorr
## zn indus chas nox rm age dis rad tax ptratio lstat
## zn 1.00 -0.54 -0.04 -0.52 0.32 -0.57 0.66 -0.32 -0.32 -0.39 -0.43
## indus -0.54 1.00 0.06 0.76 -0.39 0.64 -0.70 0.60 0.73 0.39 0.61
## chas -0.04 0.06 1.00 0.10 0.09 0.08 -0.10 -0.02 -0.05 -0.13 -0.05
## nox -0.52 0.76 0.10 1.00 -0.30 0.74 -0.77 0.60 0.65 0.18 0.60
## rm 0.32 -0.39 0.09 -0.30 1.00 -0.23 0.20 -0.21 -0.30 -0.36 -0.63
## age -0.57 0.64 0.08 0.74 -0.23 1.00 -0.75 0.46 0.51 0.26 0.61
## dis 0.66 -0.70 -0.10 -0.77 0.20 -0.75 1.00 -0.49 -0.53 -0.23 -0.51
## rad -0.32 0.60 -0.02 0.60 -0.21 0.46 -0.49 1.00 0.91 0.47 0.50
## tax -0.32 0.73 -0.05 0.65 -0.30 0.51 -0.53 0.91 1.00 0.47 0.56
## ptratio -0.39 0.39 -0.13 0.18 -0.36 0.26 -0.23 0.47 0.47 1.00 0.38
## lstat -0.43 0.61 -0.05 0.60 -0.63 0.61 -0.51 0.50 0.56 0.38 1.00
## medv 0.38 -0.50 0.16 -0.43 0.71 -0.38 0.26 -0.40 -0.49 -0.52 -0.74
## target -0.43 0.60 0.08 0.73 -0.15 0.63 -0.62 0.63 0.61 0.25 0.47
## medv target
## zn 0.38 -0.43
## indus -0.50 0.60
## chas 0.16 0.08
## nox -0.43 0.73
## rm 0.71 -0.15
## age -0.38 0.63
## dis 0.26 -0.62
## rad -0.40 0.63
## tax -0.49 0.61
## ptratio -0.52 0.25
## lstat -0.74 0.47
## medv 1.00 -0.27
## target -0.27 1.00
##
## n= 466
##
##
## P
## zn indus chas nox rm age dis rad tax ptratio
## zn 0.0000 0.3870 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## indus 0.0000 0.1874 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## chas 0.3870 0.1874 0.0355 0.0509 0.0890 0.0372 0.7321 0.3138 0.0054
## nox 0.0000 0.0000 0.0355 0.0000 0.0000 0.0000 0.0000 0.0000 0.0001
## rm 0.0000 0.0000 0.0509 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## age 0.0000 0.0000 0.0890 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## dis 0.0000 0.0000 0.0372 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## rad 0.0000 0.0000 0.7321 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## tax 0.0000 0.0000 0.3138 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## ptratio 0.0000 0.0000 0.0054 0.0001 0.0000 0.0000 0.0000 0.0000 0.0000
## lstat 0.0000 0.0000 0.2679 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## medv 0.0000 0.0000 0.0005 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## target 0.0000 0.0000 0.0843 0.0000 0.0010 0.0000 0.0000 0.0000 0.0000 0.0000
## lstat medv target
## zn 0.0000 0.0000 0.0000
## indus 0.0000 0.0000 0.0000
## chas 0.2679 0.0005 0.0843
## nox 0.0000 0.0000 0.0000
## rm 0.0000 0.0000 0.0010
## age 0.0000 0.0000 0.0000
## dis 0.0000 0.0000 0.0000
## rad 0.0000 0.0000 0.0000
## tax 0.0000 0.0000 0.0000
## ptratio 0.0000 0.0000 0.0000
## lstat 0.0000 0.0000
## medv 0.0000 0.0000
## target 0.0000 0.0000
# Correlation plot
# Compute the correlation matrix once and plot it; the original repeated the
# same two statements twice.
crime_train_df.cor <- cor(crime_train_df)
corrplot(crime_train_df.cor)
# Test whether the correlation between dis and target differs from zero.
cor_distarget <- cor.test(crime_train_df$dis, crime_train_df$target)
cor_distarget
##
## Pearson's product-moment correlation
##
## data: crime_train_df$dis and crime_train_df$target
## t = -16.963, df = 464, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.6717579 -0.5592666
## sample estimates:
## cor
## -0.6186731
# Test whether the correlation between nox and target differs from zero.
cor_noxtarget <- cor.test(crime_train_df$nox, crime_train_df$target)
cor_noxtarget
##
## Pearson's product-moment correlation
##
## data: crime_train_df$nox and crime_train_df$target
## t = 22.748, df = 464, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6801291 0.7663936
## sample estimates:
## cor
## 0.7261062
# Test the correlation between the two predictors tax and rad
# (a collinearity check rather than a predictor/response check).
cor_taxrad <- cor.test(crime_train_df$tax, crime_train_df$rad)
cor_taxrad
##
## Pearson's product-moment correlation
##
## data: crime_train_df$tax and crime_train_df$rad
## t = 46.239, df = 464, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8888115 0.9214292
## sample estimates:
## cor
## 0.9064632
# t-tests comparing predictor means between target = 0 and target = 1,
# maybe to help create buckets
Ttest_targetpt <- t.test(crime_train_df$ptratio ~ crime_train_df$target)
Ttest_targetpt
##
## Welch Two Sample t-test
##
## data: crime_train_df$ptratio by crime_train_df$target
## t = -5.5567, df = 426.42, p-value = 4.852e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.4906314 -0.7116312
## sample estimates:
## mean in group 0 mean in group 1
## 17.85738 18.95852
# Compare mean nox between the target = 0 and target = 1 groups.
Ttest_targetnox <- t.test(crime_train_df$nox ~ crime_train_df$target)
Ttest_targetnox
##
## Welch Two Sample t-test
##
## data: crime_train_df$nox by crime_train_df$target
## t = -22.545, df = 356.06, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1840337 -0.1545020
## sample estimates:
## mean in group 0 mean in group 1
## 0.4711295 0.6403974
# Compare mean ptratio between the target = 0 and target = 1 groups.
# NOTE(review): this repeats the ptratio test run earlier in the file and
# overwrites Ttest_targetpt with the same result.
Ttest_targetpt <- t.test(crime_train_df$ptratio ~ crime_train_df$target)
Ttest_targetpt
##
## Welch Two Sample t-test
##
## data: crime_train_df$ptratio by crime_train_df$target
## t = -5.5567, df = 426.42, p-value = 4.852e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.4906314 -0.7116312
## sample estimates:
## mean in group 0 mean in group 1
## 17.85738 18.95852
# Compare mean lstat between the target = 0 and target = 1 groups.
Ttest_targetlstat <- t.test(crime_train_df$lstat ~ crime_train_df$target)
Ttest_targetlstat
##
## Welch Two Sample t-test
##
## data: crime_train_df$lstat by crime_train_df$target
## t = -11.364, df = 391.47, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -7.808908 -5.505503
## sample estimates:
## mean in group 0 mean in group 1
## 9.36000 16.01721
# Convert the response to a factor so it is treated as a class label.
crime_train_df$target <- as.factor(crime_train_df$target)
# Distribution of zn by class, overall, and the values below 10.
boxplot(crime_train_df$zn ~ crime_train_df$target)
hist(crime_train_df$zn)
table(crime_train_df$zn[crime_train_df$zn < 10])
##
## 0
## 339
# Shape of zn once the zero values are set aside.
hist(crime_train_df$zn[crime_train_df$zn > 0])
# Class counts among rows with zn above 22.
with(crime_train_df, table(target[zn > 22]))
##
## 0 1
## 81 0
# Indicator for zn > 22; the preceding table shows all such rows have target 0.
crime_train_df$zn_hi <- crime_train_df$zn > 22
# Distribution of indus by class, overall, and the values above 18.
boxplot(crime_train_df$indus ~ crime_train_df$target)
hist(crime_train_df$indus)
table(crime_train_df$indus[crime_train_df$indus > 18])
##
## 18.1 19.58 21.89 25.65 27.74
## 121 28 14 5 5
# Class counts for rows with indus strictly between 18 and 20.
with(crime_train_df, table(target[indus > 18 & indus < 20]))
##
## 0 1
## 0 149
# Class counts for rows with indus above 20.
with(crime_train_df, table(target[indus > 20]))
##
## 0 1
## 11 13
# Make a dummy indicator for 18 < indus < 20
crime_train_df$indus19 <- crime_train_df$indus > 18 & crime_train_df$indus < 20

boxplot(crime_train_df$nox ~ crime_train_df$target)
boxplot(crime_train_df$rm ~ crime_train_df$target)
boxplot(crime_train_df$age ~ crime_train_df$target)
hist(crime_train_df$age[crime_train_df$target == 0])
# That looks normal for the lower crime areas. How about the higher ones?
hist(crime_train_df$age[crime_train_df$target == 1])
# Squared age term; `^` is R's documented power operator (`**` is only a
# parser-level alias for it).
crime_train_df$sq_age <- crime_train_df$age^2
boxplot(crime_train_df$dis ~ crime_train_df$target)
# Spread of dis within the target = 1 group.
sd(crime_train_df$dis[crime_train_df$target == 1])
## [1] 1.078999
# Spread of dis within the target = 0 group.
with(crime_train_df, sd(dis[target == 0]))
## [1] 2.067398
# Log-transformed dis: overall, then within each class.
hist(log(crime_train_df$dis))
hist(log(crime_train_df$dis[crime_train_df$target == 0]))
hist(log(crime_train_df$dis[crime_train_df$target == 1]))
# Taking the log normalizes the predictor conditioned on the response, so
# log_dis should be useful.
crime_train_df$log_dis <- log(crime_train_df$dis)
boxplot(crime_train_df$rad ~ crime_train_df$target)
# Spread of rad within the target = 1 group.
sd(crime_train_df$rad[crime_train_df$target == 1])
## [1] 9.514932
# Spread of rad within the target = 0 group.
with(crime_train_df, sd(rad[target == 0]))
## [1] 1.594359
# Overall distribution of rad.
hist(crime_train_df$rad)
# Class counts among rows with rad above 15.
with(crime_train_df, table(target[rad > 15]))
##
## 0 1
## 0 121
# Share of rows with rad > 8 that are also flagged by indus19.
with(crime_train_df, mean(indus19[rad > 8]))
## [1] 1
# So rad > 8 is redundant with indus19. What about the lower end of the range?
with(crime_train_df, mean(indus19[rad < 5]))
## [1] 0
# Indicator for rad strictly between 5 and 8.
# NOTE(review): both bounds are exclusive, so rows with rad equal to 5 or 8
# are not flagged even though the column is named rad5to8 -- confirm intent.
crime_train_df$rad5to8 <- 5 < crime_train_df$rad & crime_train_df$rad < 8

boxplot(crime_train_df$tax ~ crime_train_df$target)
# Spread of tax within the target = 1 group.
sd(crime_train_df$tax[crime_train_df$target == 1])
## [1] 166.6934
# Spread of tax within the target = 0 group.
with(crime_train_df, sd(tax[target == 0]))
## [1] 89.19775
# Very different variances
hist(crime_train_df$tax)
crime_train_df$log_tax <- log(crime_train_df$tax)
# Which tax values make up the tail above 600?
table(crime_train_df$tax[crime_train_df$tax > 600])
##
## 666 711
## 121 5
# Odd concentration: why do so many rows share tax == 666?
with(crime_train_df, table(target[tax == 666]))
##
## 0 1
## 0 121
# Same table as for rad > 15, so probably more redundancy: count rows that
# satisfy both conditions.
with(crime_train_df, sum(tax == 666 & rad > 15))
## [1] 121
# And the rows with tax == 711?
with(crime_train_df, table(target[tax == 711]))
##
## 0 1
## 5 0
# Does indus19 already cover those 5 rows, so that tax > 600 can be ignored?
with(crime_train_df, sum(tax == 711 & indus19))
## [1] 0
# Class counts for tax == 711 rows that indus19 does not flag.
with(crime_train_df, table(target[tax == 711 & !indus19]))
##
## 0 1
## 5 0
# Indicator for the tax == 666 rows examined above.
crime_train_df$tax_666 <- crime_train_df$tax == 666

boxplot(crime_train_df$ptratio ~ crime_train_df$target)
hist(crime_train_df$ptratio[crime_train_df$target == 1])
# Which ptratio values make up the range above 19?
table(crime_train_df$ptratio[crime_train_df$ptratio > 19])
##
## 19.1 19.2 19.6 19.7 20.1 20.2 20.9 21 21.1 21.2 22
## 14 17 6 6 5 128 11 23 1 14 2
# Class counts at the ptratio == 20.2 spike.
with(crime_train_df, table(target[ptratio == 20.2]))
##
## 0 1
## 7 121
# Indicator for the ptratio == 20.2 spike, plus a log-transformed ptratio.
crime_train_df$pt_peak <- crime_train_df$ptratio == 20.2
crime_train_df$log_ptrat <- log(crime_train_df$ptratio)

boxplot(crime_train_df$lstat ~ crime_train_df$target)
hist(crime_train_df$lstat)
crime_train_df$log_lstat <- log(crime_train_df$lstat)

boxplot(crime_train_df$medv ~ crime_train_df$target)
# Reproducible 90/10 split of the training data, partitioned on target.
set.seed(123)
split <- caret::createDataPartition(crime_train_df$target, p = 0.90, list = FALSE)
train <- crime_train_df[split, ]
validation <- crime_train_df[-split, ]

# Step 1: Create a full model
# `target ~ .` uses every other column, i.e. the raw predictors and all of
# the derived indicator / log / squared columns.
# NOTE(review): some indicators line up exactly with one class in the
# exploratory tables (e.g. zn > 22, 18 < indus < 20), so very large standard
# errors on those terms are to be expected.
model.full <- glm(target ~ ., data = train, family = "binomial")
summary(model.full)
##
## Call:
## glm(formula = target ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.11946 -0.00003 0.00000 0.00000 2.17497
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.559e+03 4.252e+02 -3.666 0.000246 ***
## zn 9.557e-01 2.506e-01 3.814 0.000137 ***
## indus 1.914e-01 2.294e-01 0.835 0.403999
## chas -1.684e-01 1.447e+00 -0.116 0.907381
## nox 1.660e+02 4.040e+01 4.108 3.99e-05 ***
## rm -5.077e+00 2.018e+00 -2.516 0.011865 *
## age 4.220e-02 7.123e-02 0.592 0.553557
## dis -3.897e+00 2.215e+00 -1.760 0.078442 .
## rad 3.748e+00 1.015e+00 3.692 0.000222 ***
## tax -5.897e-01 1.864e-01 -3.163 0.001560 **
## ptratio -1.380e+01 5.227e+00 -2.639 0.008314 **
## lstat 7.078e-01 2.648e-01 2.673 0.007512 **
## medv 7.024e-01 2.237e-01 3.140 0.001690 **
## zn_hiTRUE -7.762e+01 1.994e+03 -0.039 0.968942
## indus19TRUE 3.594e+01 6.221e+03 0.006 0.995390
## sq_age -6.331e-06 5.901e-04 -0.011 0.991440
## log_dis 1.451e+01 7.928e+00 1.831 0.067143 .
## rad5to8TRUE -1.107e+01 3.085e+00 -3.589 0.000332 ***
## log_tax 1.691e+02 5.705e+01 2.964 0.003034 **
## tax_666TRUE 3.013e+00 1.987e+04 0.000 0.999879
## pt_peakTRUE -1.941e+01 1.873e+04 -0.001 0.999173
## log_ptrat 3.216e+02 1.052e+02 3.058 0.002228 **
## log_lstat -7.259e+00 3.263e+00 -2.225 0.026096 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 583.514 on 420 degrees of freedom
## Residual deviance: 71.971 on 398 degrees of freedom
## AIC: 117.97
##
## Number of Fisher Scoring iterations: 21
# Step 2: Create a backward model using the full model
# Backward elimination by AIC; trace = FALSE suppresses the per-step log.
model.backward <- stepAIC(model.full, direction = "backward", trace = FALSE)
summary(model.backward)
##
## Call:
## glm(formula = target ~ zn + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + zn_hi + indus19 + log_dis + rad5to8 +
## log_tax + log_ptrat + log_lstat, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.19148 -0.00005 0.00000 0.00000 2.07352
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.604e+03 3.697e+02 -4.340 1.42e-05 ***
## zn 9.403e-01 2.148e-01 4.378 1.20e-05 ***
## nox 1.711e+02 3.774e+01 4.534 5.77e-06 ***
## rm -4.999e+00 1.963e+00 -2.546 0.010894 *
## age 4.196e-02 2.359e-02 1.779 0.075300 .
## dis -3.081e+00 1.464e+00 -2.105 0.035307 *
## rad 3.456e+00 9.410e-01 3.673 0.000240 ***
## tax -5.548e-01 1.316e-01 -4.216 2.49e-05 ***
## ptratio -1.647e+01 4.814e+00 -3.422 0.000621 ***
## lstat 7.394e-01 2.359e-01 3.134 0.001722 **
## medv 6.963e-01 2.184e-01 3.188 0.001434 **
## zn_hiTRUE -7.767e+01 3.314e+03 -0.023 0.981299
## indus19TRUE 3.451e+01 2.588e+03 0.013 0.989359
## log_dis 1.143e+01 5.903e+00 1.935 0.052931 .
## rad5to8TRUE -1.170e+01 2.881e+00 -4.062 4.87e-05 ***
## log_tax 1.612e+02 3.811e+01 4.229 2.35e-05 ***
## log_ptrat 3.668e+02 9.921e+01 3.697 0.000218 ***
## log_lstat -7.670e+00 3.035e+00 -2.527 0.011490 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 583.514 on 420 degrees of freedom
## Residual deviance: 73.737 on 403 degrees of freedom
## AIC: 109.74
##
## Number of Fisher Scoring iterations: 22
# Get the formula retained by the backward-selected model
formula(model.backward)
## target ~ zn + nox + rm + age + dis + rad + tax + ptratio + lstat +
## medv + zn_hi + indus19 + log_dis + rad5to8 + log_tax + log_ptrat +
## log_lstat
# Generate class predictions for the validation set.
# type = "response" returns probabilities; predict() on a glm defaults to the
# link (log-odds) scale, where a 0.5 cutoff is not a 50% probability cutoff.
model.backward.prob <- predict(model.backward, newdata = validation, type = "response")
# Pin the factor levels to those of the reference so confusionMatrix() still
# works if every validation row is predicted as the same class.
model.backward.pred <- factor(
  ifelse(model.backward.prob >= 0.5, 1, 0),
  levels = levels(validation$target)
)

# Generate the confusion matrix.
# positive = "1" reports sensitivity/precision for the class of interest
# (target = 1) instead of caret's default, the first factor level.
model.backward.confusion.matrix <- confusionMatrix(
  model.backward.pred,
  validation$target,
  positive = "1",
  mode = "everything"
)
model.backward.confusion.matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 23 1
## 1 0 21
##
## Accuracy : 0.9778
## 95% CI : (0.8823, 0.9994)
## No Information Rate : 0.5111
## P-Value [Acc > NIR] : 3.366e-12
##
## Kappa : 0.9555
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.9545
## Pos Pred Value : 0.9583
## Neg Pred Value : 1.0000
## Precision : 0.9583
## Recall : 1.0000
## F1 : 0.9787
## Prevalence : 0.5111
## Detection Rate : 0.5111
## Detection Prevalence : 0.5333
## Balanced Accuracy : 0.9773
##
## 'Positive' Class : 0
##
# Leverage vs. standardized deviance residuals for the backward model.
par(mfrow = c(1, 1))
hvalues <- influence(model.backward)$hat
stanresDeviance <- residuals(model.backward) / sqrt(1 - hvalues)
plot(
  hvalues, stanresDeviance,
  ylab = "Standardized Deviance Residuals",
  xlab = "Leverage Values",
  ylim = c(-3, 3),
  xlim = c(-0.05, 0.7)
)
# High-leverage cutoff: 2 * (number of estimated coefficients) / (number of
# observations). length(train) counts data-frame columns, not rows, and the
# hard-coded 7 did not match this model, so derive both from the objects.
abline(v = 2 * length(coef(model.backward)) / nrow(train), lty = 2)
identify(hvalues, stanresDeviance, cex = 0.75)
## integer(0)
Check the fit on the full training data set using the formula selected by the backward-elimination model.
# Refit the backward-selected formula on all of crime_train_df (not only the
# 90% training split).
fullFit <- glm(
  formula = target ~ zn + nox + rm + age + dis + rad + tax +
    ptratio + lstat + medv + zn_hi + indus19 + log_dis + rad5to8 +
    log_tax + log_ptrat + log_lstat,
  family = "binomial",
  data = crime_train_df
)
summary(fullFit)
##
## Call:
## glm(formula = target ~ zn + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + zn_hi + indus19 + log_dis + rad5to8 +
## log_tax + log_ptrat + log_lstat, family = "binomial", data = crime_train_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.17788 -0.00004 0.00000 0.00000 2.19575
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.601e+03 3.539e+02 -4.523 6.10e-06 ***
## zn 9.397e-01 2.107e-01 4.459 8.24e-06 ***
## nox 1.713e+02 3.587e+01 4.775 1.80e-06 ***
## rm -4.703e+00 1.892e+00 -2.485 0.012938 *
## age 3.921e-02 2.227e-02 1.760 0.078360 .
## dis -2.872e+00 1.409e+00 -2.038 0.041557 *
## rad 3.461e+00 9.324e-01 3.712 0.000205 ***
## tax -5.718e-01 1.338e-01 -4.275 1.91e-05 ***
## ptratio -1.578e+01 4.337e+00 -3.638 0.000275 ***
## lstat 6.780e-01 2.180e-01 3.109 0.001874 **
## medv 7.165e-01 2.135e-01 3.356 0.000792 ***
## zn_hiTRUE -7.726e+01 3.090e+03 -0.025 0.980049
## indus19TRUE 3.426e+01 2.515e+03 0.014 0.989129
## log_dis 1.018e+01 5.605e+00 1.816 0.069391 .
## rad5to8TRUE -1.159e+01 2.786e+00 -4.160 3.18e-05 ***
## log_tax 1.656e+02 3.869e+01 4.279 1.87e-05 ***
## log_ptrat 3.531e+02 8.955e+01 3.943 8.05e-05 ***
## log_lstat -6.483e+00 2.809e+00 -2.308 0.021018 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.876 on 465 degrees of freedom
## Residual deviance: 78.236 on 448 degrees of freedom
## AIC: 114.24
##
## Number of Fisher Scoring iterations: 22
# Fitted probabilities for the rows the model was trained on (no newdata), so
# the AUC and ROC curve below are in-sample.
p <- predict(model.backward, type = "response")
roc_pred <- prediction(predictions = p, labels = model.backward$y)

# Area under the ROC curve.
auc.tmp <- performance(roc_pred, "auc")
auc <- as.numeric(auc.tmp@y.values)
auc

# Plot the ROC curve, colour-coded by cutoff and labelled every 0.05.
roc_perf <- performance(roc_pred, "tpr", "fpr")
plot(roc_perf,
     colorize = TRUE,
     print.cutoffs.at = seq(0, 1, 0.05),
     text.adj = c(-0.2, 1.7))