Data 621 Week Update

options(warn = -1)

suppressMessages(require(plotly))
library(knitr)
suppressMessages(library(RCurl))
suppressMessages(library(plyr))
suppressMessages(library(ggplot2))
suppressMessages(library(plotly))
suppressMessages(require(scatterplot3d))
suppressMessages(library(Amelia))
suppressMessages(library(ROCR))
suppressMessages(library(pROC));

crime <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS621/master/crime-training-data.csv", header = TRUE, sep = ",")

crime_evaluation <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS621/master/crime-evaluation-data.csv", header = TRUE, sep = ",")

Data Exploration:

Inferential Statistics

names(crime)

##  [1] "zn"      "indus"   "chas"    "nox"     "rm"      "age"     "dis"    
##  [8] "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"    "target"

str(crime)

## 'data.frame':    466 obs. of  14 variables:
##  $ zn     : num  0 0 0 30 0 0 0 0 0 80 ...
##  $ indus  : num  19.58 19.58 18.1 4.93 2.46 ...
##  $ chas   : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.605 0.871 0.74 0.428 0.488 0.52 0.693 0.693 0.515 0.392 ...
##  $ rm     : num  7.93 5.4 6.49 6.39 7.16 ...
##  $ age    : num  96.2 100 100 7.8 92.2 71.3 100 100 38.1 19.1 ...
##  $ dis    : num  2.05 1.32 1.98 7.04 2.7 ...
##  $ rad    : int  5 5 24 6 3 5 24 24 5 1 ...
##  $ tax    : int  403 403 666 300 193 384 666 666 224 315 ...
##  $ ptratio: num  14.7 14.7 20.2 16.6 17.8 20.9 20.2 20.2 20.2 16.4 ...
##  $ black  : num  369 397 387 375 394 ...
##  $ lstat  : num  3.7 26.82 18.85 5.19 4.82 ...
##  $ medv   : num  50 13.4 15.4 23.7 37.9 26.5 5 7 22.2 20.9 ...
##  $ target : int  1 1 1 0 0 0 1 1 0 0 ...

dim(crime)

## [1] 466  14

kable(summary(crime))

zn	indus	chas	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv	target
Min. : 0.00	Min. : 0.460	Min. :0.00000	Min. :0.3890	Min. :3.863	Min. : 2.90	Min. : 1.130	Min. : 1.00	Min. :187.0	Min. :12.6	Min. : 0.32	Min. : 1.730	Min. : 5.00	Min. :0.0000
1st Qu.: 0.00	1st Qu.: 5.145	1st Qu.:0.00000	1st Qu.:0.4480	1st Qu.:5.887	1st Qu.: 43.88	1st Qu.: 2.101	1st Qu.: 4.00	1st Qu.:281.0	1st Qu.:16.9	1st Qu.:375.61	1st Qu.: 7.043	1st Qu.:17.02	1st Qu.:0.0000
Median : 0.00	Median : 9.690	Median :0.00000	Median :0.5380	Median :6.210	Median : 77.15	Median : 3.191	Median : 5.00	Median :334.5	Median :18.9	Median :391.34	Median :11.350	Median :21.20	Median :0.0000
Mean : 11.58	Mean :11.105	Mean :0.07082	Mean :0.5543	Mean :6.291	Mean : 68.37	Mean : 3.796	Mean : 9.53	Mean :409.5	Mean :18.4	Mean :357.12	Mean :12.631	Mean :22.59	Mean :0.4914
3rd Qu.: 16.25	3rd Qu.:18.100	3rd Qu.:0.00000	3rd Qu.:0.6240	3rd Qu.:6.630	3rd Qu.: 94.10	3rd Qu.: 5.215	3rd Qu.:24.00	3rd Qu.:666.0	3rd Qu.:20.2	3rd Qu.:396.24	3rd Qu.:16.930	3rd Qu.:25.00	3rd Qu.:1.0000
Max. :100.00	Max. :27.740	Max. :1.00000	Max. :0.8710	Max. :8.780	Max. :100.00	Max. :12.127	Max. :24.00	Max. :711.0	Max. :22.0	Max. :396.90	Max. :37.970	Max. :50.00	Max. :1.0000

missmap(crime, main = "Missing values vs observed")

The crime dataset contains 14 variables, with 466 observations.
There are no missing values.
The Minimum, Quatiles and Maximum values.

THE ANALYZES

Pairing and fitting the general linear model of Target as a response variance, while other variables serves as an explanatory variable (Independent variable)

pairs(crime, col=crime$target)

fit <- glm(target ~., data = crime)

A summary of the output

summary(fit)

## 
## Call:
## glm(formula = target ~ ., data = crime)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.58798  -0.21065  -0.04792   0.14512   0.88772  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.5070385  0.3679705  -4.096 4.99e-05 ***
## zn          -0.0009964  0.0009441  -1.055 0.291810    
## indus        0.0029792  0.0042907   0.694 0.487828    
## chas         0.0082079  0.0588428   0.139 0.889126    
## nox          1.9577915  0.2634246   7.432 5.39e-13 ***
## rm           0.0197145  0.0317996   0.620 0.535596    
## age          0.0032403  0.0009058   3.577 0.000385 ***
## dis          0.0132690  0.0141501   0.938 0.348886    
## rad          0.0199855  0.0043778   4.565 6.44e-06 ***
## tax         -0.0002780  0.0002615  -1.063 0.288451    
## ptratio      0.0123980  0.0093702   1.323 0.186462    
## black       -0.0002195  0.0001845  -1.190 0.234800    
## lstat        0.0042350  0.0038975   1.087 0.277798    
## medv         0.0095325  0.0030411   3.135 0.001833 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.0972825)
## 
##     Null deviance: 116.466  on 465  degrees of freedom
## Residual deviance:  43.972  on 452  degrees of freedom
## AIC: 252.39
## 
## Number of Fisher Scoring iterations: 2

par(mfrow=c(2,2))
plot(fit)

An introduction of Logistics regression for a better results

crimetarget <- glm(target~., family=binomial(link='logit'),data=crime)


summary(crimetarget)

## 
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"), 
##     data = crime)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2854  -0.1372  -0.0017   0.0020   3.4721  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -36.839521   7.028726  -5.241 1.59e-07 ***
## zn           -0.061720   0.034410  -1.794 0.072868 .  
## indus        -0.072580   0.048546  -1.495 0.134894    
## chas          1.032352   0.759627   1.359 0.174139    
## nox          50.159513   8.049503   6.231 4.62e-10 ***
## rm           -0.692145   0.741431  -0.934 0.350548    
## age           0.034522   0.013883   2.487 0.012895 *  
## dis           0.765795   0.234407   3.267 0.001087 ** 
## rad           0.663015   0.165135   4.015 5.94e-05 ***
## tax          -0.006593   0.003064  -2.152 0.031422 *  
## ptratio       0.442217   0.132234   3.344 0.000825 ***
## black        -0.013094   0.006680  -1.960 0.049974 *  
## lstat         0.047571   0.054508   0.873 0.382802    
## medv          0.199734   0.071022   2.812 0.004919 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 186.15  on 452  degrees of freedom
## AIC: 214.15
## 
## Number of Fisher Scoring iterations: 9

par(mfrow=c(2,2))
plot(crimetarget)

Although, variables like zn, chas and lstat are not statistically significance due to their p-value being greater than statiscally accepted p-value of 0.5, we can still proceed with our analysis and make some prediction.

Intrepretations:

Note that the Null deviance is 645.88, which implies that if all other parameters are held constant(control or not included), the estimate would be 645.88, while the Residual deviance of 186.15 means with the inclusion of other estimator, we expect the deviance to be 186.14.

*NB: The greater the difference between the Null deviance and Residual deviance, the better.

anova(crimetarget, test="Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: target
## 
## Terms added sequentially (first to last)
## 
## 
##         Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                      465     645.88              
## zn       1  127.411       464     518.46 < 2.2e-16 ***
## indus    1   86.433       463     432.03 < 2.2e-16 ***
## chas     1    1.274       462     430.76  0.258981    
## nox      1  150.804       461     279.95 < 2.2e-16 ***
## rm       1    6.755       460     273.20  0.009349 ** 
## age      1    0.217       459     272.98  0.641515    
## dis      1    7.981       458     265.00  0.004727 ** 
## rad      1   53.018       457     211.98 3.305e-13 ***
## tax      1    5.562       456     206.42  0.018355 *  
## ptratio  1    5.657       455     200.76  0.017388 *  
## black    1    4.558       454     196.21  0.032774 *  
## lstat    1    0.916       453     195.29  0.338509    
## medv     1    9.144       452     186.15  0.002496 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The Analysis of Variance above depicts that the “Zn”, “indus”, “rad” and “noz” contributed significantly to the increment in crime rate in this city under analysis.

pred <- predict(crimetarget, type="response")
pred2 <- prediction(pred, crime$target)
pred3 <- performance(pred2, measure = "tpr", x.measure = "fpr")
plot(pred3)

Above is the plot for Sensitivity and Specitivity for the city target, while the value below is it AUC.

auc <- performance(pred2, measure = "auc")
auc <- auc@y.values[[1]]
auc

## [1] 0.9752732

Predictions and Accuracy.

target_predicts <- predict(crimetarget,newdata=subset(crime,select=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)),type='response')
target_predicts <- ifelse(target_predicts > 0.5,1,0)

attach(crime)

table(target_predicts, target)

##                target
## target_predicts   0   1
##               0 222  20
##               1  15 209

misClasificError <- mean(target_predicts != target)
print(paste('Accuracy',1-misClasificError))

## [1] "Accuracy 0.924892703862661"

CHAS as a response variable.

fit2 <- glm(chas ~., data = crime)
summary(fit2)

## 
## Call:
## glm(formula = chas ~ ., data = crime)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.32800  -0.08801  -0.04722  -0.01367   0.94761  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -8.246e-02  2.995e-01  -0.275  0.78320   
## zn          -7.013e-05  7.556e-04  -0.093  0.92609   
## indus        7.985e-03  3.411e-03   2.341  0.01966 * 
## nox          3.113e-01  2.226e-01   1.399  0.16257   
## rm          -9.655e-03  2.543e-02  -0.380  0.70430   
## age          5.260e-04  7.338e-04   0.717  0.47389   
## dis          6.954e-03  1.132e-02   0.614  0.53922   
## rad          5.744e-03  3.569e-03   1.609  0.10822   
## tax         -4.981e-04  2.080e-04  -2.395  0.01704 * 
## ptratio     -7.581e-03  7.496e-03  -1.011  0.31242   
## black        1.005e-04  1.476e-04   0.681  0.49628   
## lstat       -1.557e-04  3.119e-03  -0.050  0.96021   
## medv         6.349e-03  2.439e-03   2.603  0.00954 **
## target       5.244e-03  3.760e-02   0.139  0.88913   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.06215705)
## 
##     Null deviance: 30.663  on 465  degrees of freedom
## Residual deviance: 28.095  on 452  degrees of freedom
## AIC: 43.646
## 
## Number of Fisher Scoring iterations: 2

par(mfrow=c(2,2))
plot(fit2)

The Above depicts that on “indus”, “tax” and “medv” are staistically significance. We will therefore explore another option: The Logit.

crimechas <- glm(chas~., family=binomial(link='logit'),data=crime)
summary(crimechas)

## 
## Call:
## glm(formula = chas ~ ., family = binomial(link = "logit"), data = crime)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0847  -0.3618  -0.2809  -0.1821   2.4811  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -3.308139   5.026203  -0.658  0.51042   
## zn          -0.003070   0.013977  -0.220  0.82617   
## indus        0.111655   0.046360   2.408  0.01602 * 
## nox          4.046547   3.249091   1.245  0.21297   
## rm          -0.130162   0.359220  -0.362  0.71709   
## age          0.003068   0.012781   0.240  0.81030   
## dis          0.057929   0.212968   0.272  0.78561   
## rad          0.179719   0.082394   2.181  0.02917 * 
## tax         -0.012178   0.004575  -2.662  0.00777 **
## ptratio     -0.114626   0.133580  -0.858  0.39083   
## black        0.003234   0.003488   0.927  0.35380   
## lstat       -0.010744   0.049840  -0.216  0.82933   
## medv         0.059348   0.032227   1.842  0.06553 . 
## target       0.384437   0.670299   0.574  0.56629   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 238.35  on 465  degrees of freedom
## Residual deviance: 203.15  on 452  degrees of freedom
## AIC: 231.15
## 
## Number of Fisher Scoring iterations: 6

plot(crimechas)

On the case of chas a response variable, the Null and Residual deviance are very close. The “Tax”, “rad” and “Indus” have strong association with the crime rate.

anova(crimechas, test="Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: chas
## 
## Terms added sequentially (first to last)
## 
## 
##         Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                      465     238.35            
## zn       1   0.8418       464     237.51 0.358879   
## indus    1   0.9467       463     236.56 0.330572   
## nox      1   2.4413       462     234.12 0.118180   
## rm       1   6.6672       461     227.46 0.009821 **
## age      1   0.0579       460     227.40 0.809840   
## dis      1   0.5268       459     226.87 0.467938   
## rad      1   3.3225       458     223.55 0.068339 . 
## tax      1  10.0287       457     213.52 0.001541 **
## ptratio  1   2.9486       456     210.57 0.085953 . 
## black    1   2.1110       455     208.46 0.146247   
## lstat    1   1.5804       454     206.88 0.208706   
## medv     1   3.4047       453     203.47 0.065012 . 
## target   1   0.3245       452     203.15 0.568911   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

After the Analysis of variance, we see that only “tax” contribute slightly to the crime rate.

predicts <- predict(crimechas, type="response")
predicts2 <- prediction(predicts, crime$chas)
predicts3 <- performance(predicts2, measure = "tpr", x.measure = "fpr")
plot(predicts3)

Above is the Sensitivity and Specitivity plot, while the AUC value is shown below.

auc2 <- performance(predicts2, measure = "auc")
auc2 <- auc2@y.values[[1]]
auc2

## [1] 0.8426762

Below is the prediction for Chas and its Accuracy

chas_predicts <- predict(crimechas,newdata=subset(crime,select=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)),type='response')
chas_predicts <- ifelse(chas_predicts > 0.5,1,0)

attach(crime)

## The following objects are masked from crime (pos = 3):
## 
##     age, black, chas, dis, indus, lstat, medv, nox, ptratio, rad,
##     rm, target, tax, zn

table(chas_predicts, chas)

##              chas
## chas_predicts   0   1
##             0 433  33

misClasificError <- mean(chas_predicts != chas)
print(paste('Accuracy',1-misClasificError))

## [1] "Accuracy 0.929184549356223"

t <- table(crime[ , c(3,14)])
colnames(t) <- c('Real Positive', 'Real Negative')
rownames(t) <- c('Model Positive', 'Model Negative')
t

##                 target
## chas             Real Positive Real Negative
##   Model Positive           225           208
##   Model Negative            12            21

getConfusionMatrix <- function(df) {
  t <- table(df$chas,df$target)
  colnames(t) <- c('Real Positive', 'Real Negative')
  rownames(t) <- c('Model Positive', 'Model Negative')

  return(t)
}

getAccuracy <- function(df) {
  cm <- getConfusionMatrix(df)

  tn <- cm["Model Negative", "Real Negative"]
  fp <- cm["Model Positive", "Real Negative"]
  fn <- cm["Model Negative", "Real Positive"]
  tp <- cm["Model Positive", "Real Positive"]

  accuracy <- (tn+tp) / (tn+fp+fn+tp)
  return(accuracy)
}

getCER <- function(df) {
  cm <- getConfusionMatrix(df)

  tn <- cm["Model Negative", "Real Negative"]
  fp <- cm["Model Positive", "Real Negative"]
  fn <- cm["Model Negative", "Real Positive"]
  tp <- cm["Model Positive", "Real Positive"]

  cer <- (fp + fn) / (tn+fp+fn+tp)
  return(cer)
}

getPrecision <- function(df) {
  cm <- getConfusionMatrix(df)

  tn <- cm["Model Negative", "Real Negative"]
  fp <- cm["Model Positive", "Real Negative"]
  fn <- cm["Model Negative", "Real Positive"]
  tp <- cm["Model Positive", "Real Positive"]

  precision <- tp / (fp+tp)
  return(precision)

}

getSensitivity <- function(df){
  cm <- getConfusionMatrix(df)

  tn <- cm["Model Negative", "Real Negative"]
  fp <- cm["Model Positive", "Real Negative"]
  fn <- cm["Model Negative", "Real Positive"]
  tp <- cm["Model Positive", "Real Positive"]

  sensitivity <- tp/(tp+fn)

  return(sensitivity)
}

getSpecificity <- function(df){
  cm <- getConfusionMatrix(df)

  tn <- cm["Model Negative", "Real Negative"]
  fp <- cm["Model Positive", "Real Negative"]
  fn <- cm["Model Negative", "Real Positive"]
  tp <- cm["Model Positive", "Real Positive"]

  specificity <- tn / (tn+fp) 
  return(specificity)
}

getF1Score <- function(df) {

  cm <- getConfusionMatrix(df)

  tn <- cm["Model Negative", "Real Negative"]
  fp <- cm["Model Positive", "Real Negative"]
  fn <- cm["Model Negative", "Real Positive"]
  tp <- cm["Model Positive", "Real Positive"]

  precision <- getPrecision(df)
  sensitivity <- getSensitivity(df)
  f1score <- 2* (precision * sensitivity) / (precision+sensitivity)

  return(f1score)

}

Confusion Matrix

getConfusionMatrix(crime)

Accuracy

getAccuracy(crime)

## [1] 0.527897

CER

getCER(crime)

## [1] 0.472103

Precision

getPrecision(crime)

## [1] 0.5196305

F1Score

getF1Score(crime)

## [1] 0.6716418

Specificity

getSpecificity(crime)

## [1] 0.09170306

Sensitivity

getSensitivity(crime);

## [1] 0.9493671

Data 621 Week Update

MUSA T. GANIYU

July 2, 2016