options(warn = -1)
suppressMessages(require(plotly))
library(knitr)
suppressMessages(library(RCurl))
suppressMessages(library(plyr))
suppressMessages(library(ggplot2))
suppressMessages(library(plotly))
suppressMessages(require(scatterplot3d))
suppressMessages(library(Amelia))
suppressMessages(library(ROCR))
suppressMessages(library(pROC));
crime <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS621/master/crime-training-data.csv", header = TRUE, sep = ",")
crime_evaluation <- read.csv("https://raw.githubusercontent.com/mascotinme/MSDA-IS621/master/crime-evaluation-data.csv", header = TRUE, sep = ",")Data Exploration:
Inferential Statistics
names(crime)## [1] "zn" "indus" "chas" "nox" "rm" "age" "dis"
## [8] "rad" "tax" "ptratio" "black" "lstat" "medv" "target"
str(crime)## 'data.frame': 466 obs. of 14 variables:
## $ zn : num 0 0 0 30 0 0 0 0 0 80 ...
## $ indus : num 19.58 19.58 18.1 4.93 2.46 ...
## $ chas : int 0 1 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.605 0.871 0.74 0.428 0.488 0.52 0.693 0.693 0.515 0.392 ...
## $ rm : num 7.93 5.4 6.49 6.39 7.16 ...
## $ age : num 96.2 100 100 7.8 92.2 71.3 100 100 38.1 19.1 ...
## $ dis : num 2.05 1.32 1.98 7.04 2.7 ...
## $ rad : int 5 5 24 6 3 5 24 24 5 1 ...
## $ tax : int 403 403 666 300 193 384 666 666 224 315 ...
## $ ptratio: num 14.7 14.7 20.2 16.6 17.8 20.9 20.2 20.2 20.2 16.4 ...
## $ black : num 369 397 387 375 394 ...
## $ lstat : num 3.7 26.82 18.85 5.19 4.82 ...
## $ medv : num 50 13.4 15.4 23.7 37.9 26.5 5 7 22.2 20.9 ...
## $ target : int 1 1 1 0 0 0 1 1 0 0 ...
dim(crime)## [1] 466 14
kable(summary(crime))| zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 0.00 | Min. : 0.460 | Min. :0.00000 | Min. :0.3890 | Min. :3.863 | Min. : 2.90 | Min. : 1.130 | Min. : 1.00 | Min. :187.0 | Min. :12.6 | Min. : 0.32 | Min. : 1.730 | Min. : 5.00 | Min. :0.0000 | |
| 1st Qu.: 0.00 | 1st Qu.: 5.145 | 1st Qu.:0.00000 | 1st Qu.:0.4480 | 1st Qu.:5.887 | 1st Qu.: 43.88 | 1st Qu.: 2.101 | 1st Qu.: 4.00 | 1st Qu.:281.0 | 1st Qu.:16.9 | 1st Qu.:375.61 | 1st Qu.: 7.043 | 1st Qu.:17.02 | 1st Qu.:0.0000 | |
| Median : 0.00 | Median : 9.690 | Median :0.00000 | Median :0.5380 | Median :6.210 | Median : 77.15 | Median : 3.191 | Median : 5.00 | Median :334.5 | Median :18.9 | Median :391.34 | Median :11.350 | Median :21.20 | Median :0.0000 | |
| Mean : 11.58 | Mean :11.105 | Mean :0.07082 | Mean :0.5543 | Mean :6.291 | Mean : 68.37 | Mean : 3.796 | Mean : 9.53 | Mean :409.5 | Mean :18.4 | Mean :357.12 | Mean :12.631 | Mean :22.59 | Mean :0.4914 | |
| 3rd Qu.: 16.25 | 3rd Qu.:18.100 | 3rd Qu.:0.00000 | 3rd Qu.:0.6240 | 3rd Qu.:6.630 | 3rd Qu.: 94.10 | 3rd Qu.: 5.215 | 3rd Qu.:24.00 | 3rd Qu.:666.0 | 3rd Qu.:20.2 | 3rd Qu.:396.24 | 3rd Qu.:16.930 | 3rd Qu.:25.00 | 3rd Qu.:1.0000 | |
| Max. :100.00 | Max. :27.740 | Max. :1.00000 | Max. :0.8710 | Max. :8.780 | Max. :100.00 | Max. :12.127 | Max. :24.00 | Max. :711.0 | Max. :22.0 | Max. :396.90 | Max. :37.970 | Max. :50.00 | Max. :1.0000 |
missmap(crime, main = "Missing values vs observed")The crime dataset contains 14 variables, with 466 observations.
There are no missing values.
The Minimum, Quatiles and Maximum values.
THE ANALYZES
Pairing and fitting the general linear model of Target as a response variance, while other variables serves as an explanatory variable (Independent variable)
pairs(crime, col=crime$target)fit <- glm(target ~., data = crime) A summary of the output
summary(fit)##
## Call:
## glm(formula = target ~ ., data = crime)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.58798 -0.21065 -0.04792 0.14512 0.88772
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.5070385 0.3679705 -4.096 4.99e-05 ***
## zn -0.0009964 0.0009441 -1.055 0.291810
## indus 0.0029792 0.0042907 0.694 0.487828
## chas 0.0082079 0.0588428 0.139 0.889126
## nox 1.9577915 0.2634246 7.432 5.39e-13 ***
## rm 0.0197145 0.0317996 0.620 0.535596
## age 0.0032403 0.0009058 3.577 0.000385 ***
## dis 0.0132690 0.0141501 0.938 0.348886
## rad 0.0199855 0.0043778 4.565 6.44e-06 ***
## tax -0.0002780 0.0002615 -1.063 0.288451
## ptratio 0.0123980 0.0093702 1.323 0.186462
## black -0.0002195 0.0001845 -1.190 0.234800
## lstat 0.0042350 0.0038975 1.087 0.277798
## medv 0.0095325 0.0030411 3.135 0.001833 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.0972825)
##
## Null deviance: 116.466 on 465 degrees of freedom
## Residual deviance: 43.972 on 452 degrees of freedom
## AIC: 252.39
##
## Number of Fisher Scoring iterations: 2
par(mfrow=c(2,2))
plot(fit)An introduction of Logistics regression for a better results
crimetarget <- glm(target~., family=binomial(link='logit'),data=crime)
summary(crimetarget)##
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"),
## data = crime)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2854 -0.1372 -0.0017 0.0020 3.4721
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -36.839521 7.028726 -5.241 1.59e-07 ***
## zn -0.061720 0.034410 -1.794 0.072868 .
## indus -0.072580 0.048546 -1.495 0.134894
## chas 1.032352 0.759627 1.359 0.174139
## nox 50.159513 8.049503 6.231 4.62e-10 ***
## rm -0.692145 0.741431 -0.934 0.350548
## age 0.034522 0.013883 2.487 0.012895 *
## dis 0.765795 0.234407 3.267 0.001087 **
## rad 0.663015 0.165135 4.015 5.94e-05 ***
## tax -0.006593 0.003064 -2.152 0.031422 *
## ptratio 0.442217 0.132234 3.344 0.000825 ***
## black -0.013094 0.006680 -1.960 0.049974 *
## lstat 0.047571 0.054508 0.873 0.382802
## medv 0.199734 0.071022 2.812 0.004919 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 186.15 on 452 degrees of freedom
## AIC: 214.15
##
## Number of Fisher Scoring iterations: 9
par(mfrow=c(2,2))
plot(crimetarget)Although, variables like zn, chas and lstat are not statistically significance due to their p-value being greater than statiscally accepted p-value of 0.5, we can still proceed with our analysis and make some prediction.
Intrepretations:
Note that the Null deviance is 645.88, which implies that if all other parameters are held constant(control or not included), the estimate would be 645.88, while the Residual deviance of 186.15 means with the inclusion of other estimator, we expect the deviance to be 186.14.
*NB: The greater the difference between the Null deviance and Residual deviance, the better.
anova(crimetarget, test="Chisq")## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: target
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 465 645.88
## zn 1 127.411 464 518.46 < 2.2e-16 ***
## indus 1 86.433 463 432.03 < 2.2e-16 ***
## chas 1 1.274 462 430.76 0.258981
## nox 1 150.804 461 279.95 < 2.2e-16 ***
## rm 1 6.755 460 273.20 0.009349 **
## age 1 0.217 459 272.98 0.641515
## dis 1 7.981 458 265.00 0.004727 **
## rad 1 53.018 457 211.98 3.305e-13 ***
## tax 1 5.562 456 206.42 0.018355 *
## ptratio 1 5.657 455 200.76 0.017388 *
## black 1 4.558 454 196.21 0.032774 *
## lstat 1 0.916 453 195.29 0.338509
## medv 1 9.144 452 186.15 0.002496 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Analysis of Variance above depicts that the “Zn”, “indus”, “rad” and “noz” contributed significantly to the increment in crime rate in this city under analysis.
pred <- predict(crimetarget, type="response")
pred2 <- prediction(pred, crime$target)
pred3 <- performance(pred2, measure = "tpr", x.measure = "fpr")
plot(pred3)Above is the plot for Sensitivity and Specitivity for the city target, while the value below is it AUC.
auc <- performance(pred2, measure = "auc")
auc <- auc@y.values[[1]]
auc## [1] 0.9752732
Predictions and Accuracy.
target_predicts <- predict(crimetarget,newdata=subset(crime,select=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)),type='response')
target_predicts <- ifelse(target_predicts > 0.5,1,0)
attach(crime)
table(target_predicts, target)## target
## target_predicts 0 1
## 0 222 20
## 1 15 209
misClasificError <- mean(target_predicts != target)
print(paste('Accuracy',1-misClasificError))## [1] "Accuracy 0.924892703862661"
CHAS as a response variable.
fit2 <- glm(chas ~., data = crime)
summary(fit2)##
## Call:
## glm(formula = chas ~ ., data = crime)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.32800 -0.08801 -0.04722 -0.01367 0.94761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.246e-02 2.995e-01 -0.275 0.78320
## zn -7.013e-05 7.556e-04 -0.093 0.92609
## indus 7.985e-03 3.411e-03 2.341 0.01966 *
## nox 3.113e-01 2.226e-01 1.399 0.16257
## rm -9.655e-03 2.543e-02 -0.380 0.70430
## age 5.260e-04 7.338e-04 0.717 0.47389
## dis 6.954e-03 1.132e-02 0.614 0.53922
## rad 5.744e-03 3.569e-03 1.609 0.10822
## tax -4.981e-04 2.080e-04 -2.395 0.01704 *
## ptratio -7.581e-03 7.496e-03 -1.011 0.31242
## black 1.005e-04 1.476e-04 0.681 0.49628
## lstat -1.557e-04 3.119e-03 -0.050 0.96021
## medv 6.349e-03 2.439e-03 2.603 0.00954 **
## target 5.244e-03 3.760e-02 0.139 0.88913
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.06215705)
##
## Null deviance: 30.663 on 465 degrees of freedom
## Residual deviance: 28.095 on 452 degrees of freedom
## AIC: 43.646
##
## Number of Fisher Scoring iterations: 2
par(mfrow=c(2,2))
plot(fit2)The Above depicts that on “indus”, “tax” and “medv” are staistically significance. We will therefore explore another option: The Logit.
crimechas <- glm(chas~., family=binomial(link='logit'),data=crime)
summary(crimechas)##
## Call:
## glm(formula = chas ~ ., family = binomial(link = "logit"), data = crime)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0847 -0.3618 -0.2809 -0.1821 2.4811
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.308139 5.026203 -0.658 0.51042
## zn -0.003070 0.013977 -0.220 0.82617
## indus 0.111655 0.046360 2.408 0.01602 *
## nox 4.046547 3.249091 1.245 0.21297
## rm -0.130162 0.359220 -0.362 0.71709
## age 0.003068 0.012781 0.240 0.81030
## dis 0.057929 0.212968 0.272 0.78561
## rad 0.179719 0.082394 2.181 0.02917 *
## tax -0.012178 0.004575 -2.662 0.00777 **
## ptratio -0.114626 0.133580 -0.858 0.39083
## black 0.003234 0.003488 0.927 0.35380
## lstat -0.010744 0.049840 -0.216 0.82933
## medv 0.059348 0.032227 1.842 0.06553 .
## target 0.384437 0.670299 0.574 0.56629
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 238.35 on 465 degrees of freedom
## Residual deviance: 203.15 on 452 degrees of freedom
## AIC: 231.15
##
## Number of Fisher Scoring iterations: 6
plot(crimechas)On the case of chas a response variable, the Null and Residual deviance are very close. The “Tax”, “rad” and “Indus” have strong association with the crime rate.
anova(crimechas, test="Chisq")## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: chas
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 465 238.35
## zn 1 0.8418 464 237.51 0.358879
## indus 1 0.9467 463 236.56 0.330572
## nox 1 2.4413 462 234.12 0.118180
## rm 1 6.6672 461 227.46 0.009821 **
## age 1 0.0579 460 227.40 0.809840
## dis 1 0.5268 459 226.87 0.467938
## rad 1 3.3225 458 223.55 0.068339 .
## tax 1 10.0287 457 213.52 0.001541 **
## ptratio 1 2.9486 456 210.57 0.085953 .
## black 1 2.1110 455 208.46 0.146247
## lstat 1 1.5804 454 206.88 0.208706
## medv 1 3.4047 453 203.47 0.065012 .
## target 1 0.3245 452 203.15 0.568911
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
After the Analysis of variance, we see that only “tax” contribute slightly to the crime rate.
predicts <- predict(crimechas, type="response")
predicts2 <- prediction(predicts, crime$chas)
predicts3 <- performance(predicts2, measure = "tpr", x.measure = "fpr")
plot(predicts3)Above is the Sensitivity and Specitivity plot, while the AUC value is shown below.
auc2 <- performance(predicts2, measure = "auc")
auc2 <- auc2@y.values[[1]]
auc2## [1] 0.8426762
Below is the prediction for Chas and its Accuracy
chas_predicts <- predict(crimechas,newdata=subset(crime,select=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)),type='response')
chas_predicts <- ifelse(chas_predicts > 0.5,1,0)
attach(crime)## The following objects are masked from crime (pos = 3):
##
## age, black, chas, dis, indus, lstat, medv, nox, ptratio, rad,
## rm, target, tax, zn
table(chas_predicts, chas)## chas
## chas_predicts 0 1
## 0 433 33
misClasificError <- mean(chas_predicts != chas)
print(paste('Accuracy',1-misClasificError))## [1] "Accuracy 0.929184549356223"
t <- table(crime[ , c(3,14)])
colnames(t) <- c('Real Positive', 'Real Negative')
rownames(t) <- c('Model Positive', 'Model Negative')
t## target
## chas Real Positive Real Negative
## Model Positive 225 208
## Model Negative 12 21
getConfusionMatrix <- function(df) {
t <- table(df$chas,df$target)
colnames(t) <- c('Real Positive', 'Real Negative')
rownames(t) <- c('Model Positive', 'Model Negative')
return(t)
}
getAccuracy <- function(df) {
cm <- getConfusionMatrix(df)
tn <- cm["Model Negative", "Real Negative"]
fp <- cm["Model Positive", "Real Negative"]
fn <- cm["Model Negative", "Real Positive"]
tp <- cm["Model Positive", "Real Positive"]
accuracy <- (tn+tp) / (tn+fp+fn+tp)
return(accuracy)
}
getCER <- function(df) {
cm <- getConfusionMatrix(df)
tn <- cm["Model Negative", "Real Negative"]
fp <- cm["Model Positive", "Real Negative"]
fn <- cm["Model Negative", "Real Positive"]
tp <- cm["Model Positive", "Real Positive"]
cer <- (fp + fn) / (tn+fp+fn+tp)
return(cer)
}
getPrecision <- function(df) {
cm <- getConfusionMatrix(df)
tn <- cm["Model Negative", "Real Negative"]
fp <- cm["Model Positive", "Real Negative"]
fn <- cm["Model Negative", "Real Positive"]
tp <- cm["Model Positive", "Real Positive"]
precision <- tp / (fp+tp)
return(precision)
}
getSensitivity <- function(df){
cm <- getConfusionMatrix(df)
tn <- cm["Model Negative", "Real Negative"]
fp <- cm["Model Positive", "Real Negative"]
fn <- cm["Model Negative", "Real Positive"]
tp <- cm["Model Positive", "Real Positive"]
sensitivity <- tp/(tp+fn)
return(sensitivity)
}
getSpecificity <- function(df){
cm <- getConfusionMatrix(df)
tn <- cm["Model Negative", "Real Negative"]
fp <- cm["Model Positive", "Real Negative"]
fn <- cm["Model Negative", "Real Positive"]
tp <- cm["Model Positive", "Real Positive"]
specificity <- tn / (tn+fp)
return(specificity)
}
getF1Score <- function(df) {
cm <- getConfusionMatrix(df)
tn <- cm["Model Negative", "Real Negative"]
fp <- cm["Model Positive", "Real Negative"]
fn <- cm["Model Negative", "Real Positive"]
tp <- cm["Model Positive", "Real Positive"]
precision <- getPrecision(df)
sensitivity <- getSensitivity(df)
f1score <- 2* (precision * sensitivity) / (precision+sensitivity)
return(f1score)
}Confusion Matrix
getConfusionMatrix(crime)
Accuracy
getAccuracy(crime)## [1] 0.527897
CER
getCER(crime)## [1] 0.472103
Precision
getPrecision(crime)## [1] 0.5196305
F1Score
getF1Score(crime)## [1] 0.6716418
Specificity
getSpecificity(crime)## [1] 0.09170306
Sensitivity
getSensitivity(crime);## [1] 0.9493671