#===============================================================================================================================
# Multivariate Logistic Regression Model
# Machine Learning for Data Science--spring 2017
# Namita J. Kadam #RUID: 174004689
#===============================================================================================================================

# Question:
# Using the Boston dataset, develop multivariate logistic regression models in order to predict whether a given suburb has a crime rate below or above the median. Describe your findings. The dataset is located in the MASS package.

# Loading the required package:

require(MASS)
## Loading required package: MASS
# Loading the Boston dataset:

data("Boston")

attach(Boston)

dim(Boston)
## [1] 506  14
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Creating a binary response variable indicating whether a suburb's crime rate is above the median (1) or below the median (0):

Pcrime <- rep(0,length(crim))

Pcrime[crim > median(crim)] <- 1
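
# Equivalently, the indicator can be built in one step; this rebuilds Pcrime
# with identical values and is shown only as a more compact alternative:

Pcrime <- as.numeric(crim > median(crim))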

# Creating a copy of the Boston data with the response variable (Pcrime) added as a new column:

Bos <- data.frame(Boston, Pcrime)

# Converting the necessary variables into factor variables:

Bos$chas <- factor(Bos$chas)

Bos$Pcrime <- factor(Bos$Pcrime)

# Splitting the data into equal (50/50) training and test sets so the model can be evaluated on held-out data:

set.seed(174004689)

BosTrain <- sample(nrow(Bos),as.integer(nrow(Bos)*0.50))

train.bos = Bos[BosTrain,]

test.bos = Bos[-BosTrain,]

crimebos = test.bos$Pcrime
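
# Optional sanity check: because Pcrime is defined by a median split, the two
# classes should be roughly balanced in both the training and the test half.

prop.table(table(train.bos$Pcrime))

prop.table(table(test.bos$Pcrime))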

# Building a multivariate logistic regression model:

bosmodel = glm(Pcrime ~ . - crim, data = train.bos, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
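# Note: the glm.fit warning above means some fitted probabilities are numerically
# 0 or 1, i.e. a few training suburbs are classified (almost) perfectly by the
# full set of predictors. The fit is still usable, but the affected coefficients
# and their standard errors should be read with some caution.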
summary(bosmodel)
## 
## Call:
## glm(formula = Pcrime ~ . - crim, family = "binomial", data = train.bos)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.10335  -0.04751   0.00000   0.00004   2.35014  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -42.371661  15.657336  -2.706  0.00681 ** 
## zn           -0.095565   0.061402  -1.556  0.11961    
## indus        -0.124483   0.086293  -1.443  0.14914    
## chas1         1.664381   1.444917   1.152  0.24937    
## nox          70.836337  17.479096   4.053 5.06e-05 ***
## rm           -0.701962   1.397997  -0.502  0.61558    
## age           0.009478   0.020741   0.457  0.64770    
## dis           0.718481   0.368414   1.950  0.05115 .  
## rad           0.945530   0.301683   3.134  0.00172 ** 
## tax          -0.004477   0.004858  -0.922  0.35672    
## ptratio       0.581702   0.257537   2.259  0.02390 *  
## black        -0.040465   0.022128  -1.829  0.06745 .  
## lstat         0.159424   0.090200   1.767  0.07715 .  
## medv          0.283050   0.138432   2.045  0.04089 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 350.729  on 252  degrees of freedom
## Residual deviance:  74.698  on 239  degrees of freedom
## AIC: 102.7
## 
## Number of Fisher Scoring iterations: 10
# From the summary, nox, rad, ptratio and medv are statistically significant at the 5% level, while dis, black and lstat are only marginally significant (p < 0.1).
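
# As a quick cross-check (a minimal sketch; the 5% cutoff is the usual convention,
# and the intercept will also appear in the result), the significant predictors
# can be extracted from the coefficient table programmatically:

bos.coefs <- coef(summary(bosmodel))

rownames(bos.coefs)[bos.coefs[, "Pr(>|z|)"] < 0.05]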

predbos = predict(bosmodel,test.bos, type = "response")

predbos.glm <- rep(0, length(predbos))

predbos.glm[predbos > 0.5] <- 1

accuracy <- table(predbos.glm, crimebos)

# Error rate of the model:

mean(predbos.glm != crimebos)
## [1] 0.1225296
# Accuracy of the model:

sum(diag(accuracy))/sum(accuracy)
## [1] 0.8774704
require(caret)
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(predbos.glm, crimebos)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 112  16
##          1  15 110
##                                           
##                Accuracy : 0.8775          
##                  95% CI : (0.8306, 0.9152)
##     No Information Rate : 0.502           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7549          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8819          
##             Specificity : 0.8730          
##          Pos Pred Value : 0.8750          
##          Neg Pred Value : 0.8800          
##              Prevalence : 0.5020          
##          Detection Rate : 0.4427          
##    Detection Prevalence : 0.5059          
##       Balanced Accuracy : 0.8775          
##                                           
##        'Positive' Class : 0               
## 
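# Note that caret treats class 0 (crime rate below the median) as the positive
# class here, so Sensitivity above is the rate of correctly identifying
# below-median suburbs. Depending on the caret version, the predictions may need
# to be supplied as a factor, e.g.
# confusionMatrix(factor(predbos.glm, levels = levels(crimebos)), crimebos).
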
# Building a multivariate logistic regression model using only the significant variables:

bosmodel1 = glm(Pcrime ~ nox  + rad + medv + ptratio , data = train.bos, family = "binomial")

summary(bosmodel1)  
## 
## Call:
## glm(formula = Pcrime ~ nox + rad + medv + ptratio, family = "binomial", 
##     data = train.bos)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.84070  -0.16811   0.00001   0.00059   2.56738  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -37.05472    7.59917  -4.876 1.08e-06 ***
## nox          44.66515    8.14935   5.481 4.23e-08 ***
## rad           0.72388    0.18662   3.879 0.000105 ***
## medv          0.09681    0.04132   2.343 0.019147 *  
## ptratio       0.38664    0.16277   2.375 0.017533 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 350.73  on 252  degrees of freedom
## Residual deviance: 100.82  on 248  degrees of freedom
## AIC: 110.82
## 
## Number of Fisher Scoring iterations: 9
predbos = predict(bosmodel1,test.bos, type = "response")

predbos.glm <- rep(0, length(predbos))

predbos.glm[predbos > 0.5] <- 1

accuracy <- table(predbos.glm, crimebos)

accuracy
##            crimebos
## predbos.glm   0   1
##           0 112  25
##           1  15 101
# Error rate of the model:

mean(predbos.glm != crimebos)
## [1] 0.1581028
# Accuracy of the model:

sum(diag(accuracy))/sum(accuracy)
## [1] 0.8418972
# We see that the accuracy of this reduced model (using only the significant predictors) is slightly lower than that of the full model (0.842 vs. 0.877 on the test set).
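
# The two fits can also be compared on AIC (lower is preferred); this is an
# in-sample comparison that complements, rather than replaces, the held-out
# error rates computed above.

AIC(bosmodel, bosmodel1)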

require(caret)

confusionMatrix(predbos.glm, crimebos)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 112  25
##          1  15 101
##                                          
##                Accuracy : 0.8419         
##                  95% CI : (0.791, 0.8846)
##     No Information Rate : 0.502          
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.6837         
##  Mcnemar's Test P-Value : 0.1547         
##                                          
##             Sensitivity : 0.8819         
##             Specificity : 0.8016         
##          Pos Pred Value : 0.8175         
##          Neg Pred Value : 0.8707         
##              Prevalence : 0.5020         
##          Detection Rate : 0.4427         
##    Detection Prevalence : 0.5415         
##       Balanced Accuracy : 0.8417         
##                                          
##        'Positive' Class : 0              
## 
# Thus, in both models, slightly more than half of the test-set suburbs are predicted to have a crime rate below the median.
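
# The class proportions behind this statement can be checked directly from the
# predictions; for the reduced model fitted last, this is:

prop.table(table(predbos.glm))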