options(warn = -1)

suppressMessages(require(plotly))
library(knitr)
suppressMessages(library(RCurl))
suppressMessages(library(plyr))
suppressMessages(library(ggplot2))
suppressMessages(library(plotly))
suppressMessages(require(scatterplot3d))
suppressMessages(library(Amelia))
suppressMessages(library(ROCR))
suppressMessages(library(pROC));

crime <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS621/master/crime-training-data.csv", header = TRUE, sep = ",")

crime_evaluation <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS621/master/crime-evaluation-data.csv", header = TRUE, sep = ",")

Data Exploration:

Inferential Statistics

names(crime)
##  [1] "zn"      "indus"   "chas"    "nox"     "rm"      "age"     "dis"    
##  [8] "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"    "target"
str(crime)
## 'data.frame':    466 obs. of  14 variables:
##  $ zn     : num  0 0 0 30 0 0 0 0 0 80 ...
##  $ indus  : num  19.58 19.58 18.1 4.93 2.46 ...
##  $ chas   : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.605 0.871 0.74 0.428 0.488 0.52 0.693 0.693 0.515 0.392 ...
##  $ rm     : num  7.93 5.4 6.49 6.39 7.16 ...
##  $ age    : num  96.2 100 100 7.8 92.2 71.3 100 100 38.1 19.1 ...
##  $ dis    : num  2.05 1.32 1.98 7.04 2.7 ...
##  $ rad    : int  5 5 24 6 3 5 24 24 5 1 ...
##  $ tax    : int  403 403 666 300 193 384 666 666 224 315 ...
##  $ ptratio: num  14.7 14.7 20.2 16.6 17.8 20.9 20.2 20.2 20.2 16.4 ...
##  $ black  : num  369 397 387 375 394 ...
##  $ lstat  : num  3.7 26.82 18.85 5.19 4.82 ...
##  $ medv   : num  50 13.4 15.4 23.7 37.9 26.5 5 7 22.2 20.9 ...
##  $ target : int  1 1 1 0 0 0 1 1 0 0 ...
dim(crime)
## [1] 466  14
kable(summary(crime))
zn indus chas nox rm age dis rad tax ptratio black lstat medv target
Min. : 0.00 Min. : 0.460 Min. :0.00000 Min. :0.3890 Min. :3.863 Min. : 2.90 Min. : 1.130 Min. : 1.00 Min. :187.0 Min. :12.6 Min. : 0.32 Min. : 1.730 Min. : 5.00 Min. :0.0000
1st Qu.: 0.00 1st Qu.: 5.145 1st Qu.:0.00000 1st Qu.:0.4480 1st Qu.:5.887 1st Qu.: 43.88 1st Qu.: 2.101 1st Qu.: 4.00 1st Qu.:281.0 1st Qu.:16.9 1st Qu.:375.61 1st Qu.: 7.043 1st Qu.:17.02 1st Qu.:0.0000
Median : 0.00 Median : 9.690 Median :0.00000 Median :0.5380 Median :6.210 Median : 77.15 Median : 3.191 Median : 5.00 Median :334.5 Median :18.9 Median :391.34 Median :11.350 Median :21.20 Median :0.0000
Mean : 11.58 Mean :11.105 Mean :0.07082 Mean :0.5543 Mean :6.291 Mean : 68.37 Mean : 3.796 Mean : 9.53 Mean :409.5 Mean :18.4 Mean :357.12 Mean :12.631 Mean :22.59 Mean :0.4914
3rd Qu.: 16.25 3rd Qu.:18.100 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.630 3rd Qu.: 94.10 3rd Qu.: 5.215 3rd Qu.:24.00 3rd Qu.:666.0 3rd Qu.:20.2 3rd Qu.:396.24 3rd Qu.:16.930 3rd Qu.:25.00 3rd Qu.:1.0000
Max. :100.00 Max. :27.740 Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.00 Max. :711.0 Max. :22.0 Max. :396.90 Max. :37.970 Max. :50.00 Max. :1.0000
missmap(crime, main = "Missing values vs observed")

THE ANALYZES

Pairing and fitting the general linear model of Target as a response variance, while other variables serves as an explanatory variable (Independent variable)

pairs(crime, col=crime$target)

fit <- glm(target ~., data = crime) 

A summary of the output

summary(fit)
## 
## Call:
## glm(formula = target ~ ., data = crime)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.58798  -0.21065  -0.04792   0.14512   0.88772  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.5070385  0.3679705  -4.096 4.99e-05 ***
## zn          -0.0009964  0.0009441  -1.055 0.291810    
## indus        0.0029792  0.0042907   0.694 0.487828    
## chas         0.0082079  0.0588428   0.139 0.889126    
## nox          1.9577915  0.2634246   7.432 5.39e-13 ***
## rm           0.0197145  0.0317996   0.620 0.535596    
## age          0.0032403  0.0009058   3.577 0.000385 ***
## dis          0.0132690  0.0141501   0.938 0.348886    
## rad          0.0199855  0.0043778   4.565 6.44e-06 ***
## tax         -0.0002780  0.0002615  -1.063 0.288451    
## ptratio      0.0123980  0.0093702   1.323 0.186462    
## black       -0.0002195  0.0001845  -1.190 0.234800    
## lstat        0.0042350  0.0038975   1.087 0.277798    
## medv         0.0095325  0.0030411   3.135 0.001833 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.0972825)
## 
##     Null deviance: 116.466  on 465  degrees of freedom
## Residual deviance:  43.972  on 452  degrees of freedom
## AIC: 252.39
## 
## Number of Fisher Scoring iterations: 2
par(mfrow=c(2,2))
plot(fit)

An introduction of Logistics regression for a better results

crimetarget <- glm(target~., family=binomial(link='logit'),data=crime)


summary(crimetarget)
## 
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"), 
##     data = crime)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2854  -0.1372  -0.0017   0.0020   3.4721  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -36.839521   7.028726  -5.241 1.59e-07 ***
## zn           -0.061720   0.034410  -1.794 0.072868 .  
## indus        -0.072580   0.048546  -1.495 0.134894    
## chas          1.032352   0.759627   1.359 0.174139    
## nox          50.159513   8.049503   6.231 4.62e-10 ***
## rm           -0.692145   0.741431  -0.934 0.350548    
## age           0.034522   0.013883   2.487 0.012895 *  
## dis           0.765795   0.234407   3.267 0.001087 ** 
## rad           0.663015   0.165135   4.015 5.94e-05 ***
## tax          -0.006593   0.003064  -2.152 0.031422 *  
## ptratio       0.442217   0.132234   3.344 0.000825 ***
## black        -0.013094   0.006680  -1.960 0.049974 *  
## lstat         0.047571   0.054508   0.873 0.382802    
## medv          0.199734   0.071022   2.812 0.004919 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 186.15  on 452  degrees of freedom
## AIC: 214.15
## 
## Number of Fisher Scoring iterations: 9
par(mfrow=c(2,2))
plot(crimetarget)

Although, variables like zn, chas and lstat are not statistically significance due to their p-value being greater than statiscally accepted p-value of 0.5, we can still proceed with our analysis and make some prediction.

Intrepretations:

Note that the Null deviance is 645.88, which implies that if all other parameters are held constant(control or not included), the estimate would be 645.88, while the Residual deviance of 186.15 means with the inclusion of other estimator, we expect the deviance to be 186.14.

*NB: The greater the difference between the Null deviance and Residual deviance, the better.

anova(crimetarget, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: target
## 
## Terms added sequentially (first to last)
## 
## 
##         Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                      465     645.88              
## zn       1  127.411       464     518.46 < 2.2e-16 ***
## indus    1   86.433       463     432.03 < 2.2e-16 ***
## chas     1    1.274       462     430.76  0.258981    
## nox      1  150.804       461     279.95 < 2.2e-16 ***
## rm       1    6.755       460     273.20  0.009349 ** 
## age      1    0.217       459     272.98  0.641515    
## dis      1    7.981       458     265.00  0.004727 ** 
## rad      1   53.018       457     211.98 3.305e-13 ***
## tax      1    5.562       456     206.42  0.018355 *  
## ptratio  1    5.657       455     200.76  0.017388 *  
## black    1    4.558       454     196.21  0.032774 *  
## lstat    1    0.916       453     195.29  0.338509    
## medv     1    9.144       452     186.15  0.002496 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The Analysis of Variance above depicts that the “Zn”, “indus”, “rad” and “noz” contributed significantly to the increment in crime rate in this city under analysis.

pred <- predict(crimetarget, type="response")
pred2 <- prediction(pred, crime$target)
pred3 <- performance(pred2, measure = "tpr", x.measure = "fpr")
plot(pred3)

Above is the plot for Sensitivity and Specitivity for the city target, while the value below is it AUC.

auc <- performance(pred2, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.9752732

Predictions and Accuracy.

target_predicts <- predict(crimetarget,newdata=subset(crime,select=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)),type='response')
target_predicts <- ifelse(target_predicts > 0.5,1,0)

attach(crime)

table(target_predicts, target)
##                target
## target_predicts   0   1
##               0 222  20
##               1  15 209
misClasificError <- mean(target_predicts != target)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.924892703862661"

CHAS as a response variable.

fit2 <- glm(chas ~., data = crime)
summary(fit2)
## 
## Call:
## glm(formula = chas ~ ., data = crime)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.32800  -0.08801  -0.04722  -0.01367   0.94761  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -8.246e-02  2.995e-01  -0.275  0.78320   
## zn          -7.013e-05  7.556e-04  -0.093  0.92609   
## indus        7.985e-03  3.411e-03   2.341  0.01966 * 
## nox          3.113e-01  2.226e-01   1.399  0.16257   
## rm          -9.655e-03  2.543e-02  -0.380  0.70430   
## age          5.260e-04  7.338e-04   0.717  0.47389   
## dis          6.954e-03  1.132e-02   0.614  0.53922   
## rad          5.744e-03  3.569e-03   1.609  0.10822   
## tax         -4.981e-04  2.080e-04  -2.395  0.01704 * 
## ptratio     -7.581e-03  7.496e-03  -1.011  0.31242   
## black        1.005e-04  1.476e-04   0.681  0.49628   
## lstat       -1.557e-04  3.119e-03  -0.050  0.96021   
## medv         6.349e-03  2.439e-03   2.603  0.00954 **
## target       5.244e-03  3.760e-02   0.139  0.88913   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.06215705)
## 
##     Null deviance: 30.663  on 465  degrees of freedom
## Residual deviance: 28.095  on 452  degrees of freedom
## AIC: 43.646
## 
## Number of Fisher Scoring iterations: 2
par(mfrow=c(2,2))
plot(fit2)

The Above depicts that on “indus”, “tax” and “medv” are staistically significance. We will therefore explore another option: The Logit.

crimechas <- glm(chas~., family=binomial(link='logit'),data=crime)
summary(crimechas)
## 
## Call:
## glm(formula = chas ~ ., family = binomial(link = "logit"), data = crime)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0847  -0.3618  -0.2809  -0.1821   2.4811  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -3.308139   5.026203  -0.658  0.51042   
## zn          -0.003070   0.013977  -0.220  0.82617   
## indus        0.111655   0.046360   2.408  0.01602 * 
## nox          4.046547   3.249091   1.245  0.21297   
## rm          -0.130162   0.359220  -0.362  0.71709   
## age          0.003068   0.012781   0.240  0.81030   
## dis          0.057929   0.212968   0.272  0.78561   
## rad          0.179719   0.082394   2.181  0.02917 * 
## tax         -0.012178   0.004575  -2.662  0.00777 **
## ptratio     -0.114626   0.133580  -0.858  0.39083   
## black        0.003234   0.003488   0.927  0.35380   
## lstat       -0.010744   0.049840  -0.216  0.82933   
## medv         0.059348   0.032227   1.842  0.06553 . 
## target       0.384437   0.670299   0.574  0.56629   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 238.35  on 465  degrees of freedom
## Residual deviance: 203.15  on 452  degrees of freedom
## AIC: 231.15
## 
## Number of Fisher Scoring iterations: 6
plot(crimechas)

On the case of chas a response variable, the Null and Residual deviance are very close. The “Tax”, “rad” and “Indus” have strong association with the crime rate.

anova(crimechas, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: chas
## 
## Terms added sequentially (first to last)
## 
## 
##         Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                      465     238.35            
## zn       1   0.8418       464     237.51 0.358879   
## indus    1   0.9467       463     236.56 0.330572   
## nox      1   2.4413       462     234.12 0.118180   
## rm       1   6.6672       461     227.46 0.009821 **
## age      1   0.0579       460     227.40 0.809840   
## dis      1   0.5268       459     226.87 0.467938   
## rad      1   3.3225       458     223.55 0.068339 . 
## tax      1  10.0287       457     213.52 0.001541 **
## ptratio  1   2.9486       456     210.57 0.085953 . 
## black    1   2.1110       455     208.46 0.146247   
## lstat    1   1.5804       454     206.88 0.208706   
## medv     1   3.4047       453     203.47 0.065012 . 
## target   1   0.3245       452     203.15 0.568911   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

After the Analysis of variance, we see that only “tax” contribute slightly to the crime rate.

predicts <- predict(crimechas, type="response")
predicts2 <- prediction(predicts, crime$chas)
predicts3 <- performance(predicts2, measure = "tpr", x.measure = "fpr")
plot(predicts3)

Above is the Sensitivity and Specitivity plot, while the AUC value is shown below.

auc2 <- performance(predicts2, measure = "auc")
auc2 <- auc2@y.values[[1]]
auc2
## [1] 0.8426762

Below is the prediction for Chas and its Accuracy

chas_predicts <- predict(crimechas,newdata=subset(crime,select=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)),type='response')
chas_predicts <- ifelse(chas_predicts > 0.5,1,0)

attach(crime)
## The following objects are masked from crime (pos = 3):
## 
##     age, black, chas, dis, indus, lstat, medv, nox, ptratio, rad,
##     rm, target, tax, zn
table(chas_predicts, chas)
##              chas
## chas_predicts   0   1
##             0 433  33
misClasificError <- mean(chas_predicts != chas)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.929184549356223"
t <- table(crime[ , c(3,14)])
colnames(t) <- c('Real Positive', 'Real Negative')
rownames(t) <- c('Model Positive', 'Model Negative')
t
##                 target
## chas             Real Positive Real Negative
##   Model Positive           225           208
##   Model Negative            12            21
getConfusionMatrix <- function(df) {
  t <- table(df$chas,df$target)
  colnames(t) <- c('Real Positive', 'Real Negative')
  rownames(t) <- c('Model Positive', 'Model Negative')

  return(t)
}

getAccuracy <- function(df) {
  cm <- getConfusionMatrix(df)

  tn <- cm["Model Negative", "Real Negative"]
  fp <- cm["Model Positive", "Real Negative"]
  fn <- cm["Model Negative", "Real Positive"]
  tp <- cm["Model Positive", "Real Positive"]

  accuracy <- (tn+tp) / (tn+fp+fn+tp)
  return(accuracy)
}

getCER <- function(df) {
  cm <- getConfusionMatrix(df)

  tn <- cm["Model Negative", "Real Negative"]
  fp <- cm["Model Positive", "Real Negative"]
  fn <- cm["Model Negative", "Real Positive"]
  tp <- cm["Model Positive", "Real Positive"]

  cer <- (fp + fn) / (tn+fp+fn+tp)
  return(cer)
}

getPrecision <- function(df) {
  cm <- getConfusionMatrix(df)

  tn <- cm["Model Negative", "Real Negative"]
  fp <- cm["Model Positive", "Real Negative"]
  fn <- cm["Model Negative", "Real Positive"]
  tp <- cm["Model Positive", "Real Positive"]

  precision <- tp / (fp+tp)
  return(precision)

}

getSensitivity <- function(df){
  cm <- getConfusionMatrix(df)

  tn <- cm["Model Negative", "Real Negative"]
  fp <- cm["Model Positive", "Real Negative"]
  fn <- cm["Model Negative", "Real Positive"]
  tp <- cm["Model Positive", "Real Positive"]

  sensitivity <- tp/(tp+fn)

  return(sensitivity)
}

getSpecificity <- function(df){
  cm <- getConfusionMatrix(df)

  tn <- cm["Model Negative", "Real Negative"]
  fp <- cm["Model Positive", "Real Negative"]
  fn <- cm["Model Negative", "Real Positive"]
  tp <- cm["Model Positive", "Real Positive"]

  specificity <- tn / (tn+fp) 
  return(specificity)
}

getF1Score <- function(df) {

  cm <- getConfusionMatrix(df)

  tn <- cm["Model Negative", "Real Negative"]
  fp <- cm["Model Positive", "Real Negative"]
  fn <- cm["Model Negative", "Real Positive"]
  tp <- cm["Model Positive", "Real Positive"]

  precision <- getPrecision(df)
  sensitivity <- getSensitivity(df)
  f1score <- 2* (precision * sensitivity) / (precision+sensitivity)

  return(f1score)

}

Confusion Matrix

getConfusionMatrix(crime)

Accuracy

getAccuracy(crime)
## [1] 0.527897

CER

getCER(crime)
## [1] 0.472103

Precision

getPrecision(crime)
## [1] 0.5196305

F1Score

getF1Score(crime)
## [1] 0.6716418

Specificity

getSpecificity(crime)
## [1] 0.09170306

Sensitivity

getSensitivity(crime);
## [1] 0.9493671