#===============================================================================================================================
# Multivariate Logistic Regression Model
# Machine Learning for Data Science--spring 2017
# Namita J. Kadam #RUID: 174004689
#===============================================================================================================================
# Question:
# Using the Boston dataset, develop multivariate logistic regression models in order to predict whether a given suburb has a crime rate below or above the median. Describe your findings. The dataset is located in MASS package.
#Installing the required package:
require(MASS)
## Loading required package: MASS
#Loading the Boston dataset:
data("Boston")
attach(Boston)
dim(Boston)
## [1] 506 14
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Creating the variable to store the crime rate below or above the median (1: above the median, 0: below the median)
Pcrime <- rep(0,length(crim))
Pcrime[crim > median(crim)] <- 1
# Copying Boston data into another dataframe:
Bos <- data.frame(Boston)
# Adding the response variable column (Pcrime) created above in this dataframe:
Bos <- data.frame(Boston, Pcrime)
# Converting the necessary variables into factor variables:
Bos$chas <- factor(Bos$chas)
Bos$Pcrime <- factor(Bos$Pcrime)
# Splitting the data into training-testing data for better prediction:
set.seed(174004689)
BosTrain <- sample(nrow(Bos),as.integer(nrow(Bos)*0.50))
train.bos = Bos[BosTrain,]
test.bos = Bos[-BosTrain,]
crimebos = test.bos$Pcrime
# Building a multivariate logistic regression model:
bosmodel = glm(Pcrime ~ . - crim, data = train.bos, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(bosmodel)
##
## Call:
## glm(formula = Pcrime ~ . - crim, family = "binomial", data = train.bos)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.10335 -0.04751 0.00000 0.00004 2.35014
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.371661 15.657336 -2.706 0.00681 **
## zn -0.095565 0.061402 -1.556 0.11961
## indus -0.124483 0.086293 -1.443 0.14914
## chas1 1.664381 1.444917 1.152 0.24937
## nox 70.836337 17.479096 4.053 5.06e-05 ***
## rm -0.701962 1.397997 -0.502 0.61558
## age 0.009478 0.020741 0.457 0.64770
## dis 0.718481 0.368414 1.950 0.05115 .
## rad 0.945530 0.301683 3.134 0.00172 **
## tax -0.004477 0.004858 -0.922 0.35672
## ptratio 0.581702 0.257537 2.259 0.02390 *
## black -0.040465 0.022128 -1.829 0.06745 .
## lstat 0.159424 0.090200 1.767 0.07715 .
## medv 0.283050 0.138432 2.045 0.04089 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 350.729 on 252 degrees of freedom
## Residual deviance: 74.698 on 239 degrees of freedom
## AIC: 102.7
##
## Number of Fisher Scoring iterations: 10
# We see that nox, rad, dis, ptratio and medv seem to be statistically significant in the model
predbos = predict(bosmodel,test.bos, type = "response")
predbos.glm <- rep(0, length(predbos))
predbos.glm[predbos > 0.5] <- 1
accuracy <- table(predbos.glm, crimebos)
# Error rate of the model:
mean(predbos.glm != crimebos)
## [1] 0.1225296
# Accuracy of the model:
sum(diag(accuracy))/sum(accuracy)
## [1] 0.8774704
require(caret)
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(predbos.glm, crimebos)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 112 16
## 1 15 110
##
## Accuracy : 0.8775
## 95% CI : (0.8306, 0.9152)
## No Information Rate : 0.502
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7549
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8819
## Specificity : 0.8730
## Pos Pred Value : 0.8750
## Neg Pred Value : 0.8800
## Prevalence : 0.5020
## Detection Rate : 0.4427
## Detection Prevalence : 0.5059
## Balanced Accuracy : 0.8775
##
## 'Positive' Class : 0
##
# Building a multivariate logistic regression model using only the significant variables:
bosmodel1 = glm(Pcrime ~ nox + rad + medv + ptratio , data = train.bos, family = "binomial")
summary(bosmodel1)
##
## Call:
## glm(formula = Pcrime ~ nox + rad + medv + ptratio, family = "binomial",
## data = train.bos)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.84070 -0.16811 0.00001 0.00059 2.56738
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -37.05472 7.59917 -4.876 1.08e-06 ***
## nox 44.66515 8.14935 5.481 4.23e-08 ***
## rad 0.72388 0.18662 3.879 0.000105 ***
## medv 0.09681 0.04132 2.343 0.019147 *
## ptratio 0.38664 0.16277 2.375 0.017533 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 350.73 on 252 degrees of freedom
## Residual deviance: 100.82 on 248 degrees of freedom
## AIC: 110.82
##
## Number of Fisher Scoring iterations: 9
predbos = predict(bosmodel1,test.bos, type = "response")
predbos.glm <- rep(0, length(predbos))
predbos.glm[predbos > 0.5] <- 1
accuracy <- table(predbos.glm, crimebos)
accuracy
## crimebos
## predbos.glm 0 1
## 0 112 25
## 1 15 101
# Error rate of the model:
mean(predbos.glm != crimebos)
## [1] 0.1581028
# Accuracy of the model:
sum(diag(accuracy))/sum(accuracy)
## [1] 0.8418972
# Wee see that the accuracy for this model with statistically significant variables is lower than the previous model.
require(caret)
confusionMatrix(predbos.glm, crimebos)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 112 25
## 1 15 101
##
## Accuracy : 0.8419
## 95% CI : (0.791, 0.8846)
## No Information Rate : 0.502
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6837
## Mcnemar's Test P-Value : 0.1547
##
## Sensitivity : 0.8819
## Specificity : 0.8016
## Pos Pred Value : 0.8175
## Neg Pred Value : 0.8707
## Prevalence : 0.5020
## Detection Rate : 0.4427
## Detection Prevalence : 0.5415
## Balanced Accuracy : 0.8417
##
## 'Positive' Class : 0
##
# Thus, in both the models, we can see that the a crime rate has majoritarily predicted to be below the mean.