if (!require("ggplot2",character.only = TRUE)) (install.packages("ggplot2",dep=TRUE))
## Loading required package: ggplot2
if (!require("MASS",character.only = TRUE)) (install.packages("MASS",dep=TRUE))
## Loading required package: MASS
if (!require("knitr",character.only = TRUE)) (install.packages("knitr",dep=TRUE))
## Loading required package: knitr
if (!require("xtable",character.only = TRUE)) (install.packages("xtable",dep=TRUE))
## Loading required package: xtable
if (!require("dplyr",character.only = TRUE)) (install.packages("dplyr",dep=TRUE))
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
if (!require("psych",character.only = TRUE)) (install.packages("psych",dep=TRUE))
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
if (!require("stringr",character.only = TRUE)) (install.packages("stringr",dep=TRUE))
## Loading required package: stringr
if (!require("car",character.only = TRUE)) (install.packages("car",dep=TRUE))
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
if (!require("e1071",character.only = TRUE)) (install.packages("e1071",dep=TRUE))
## Loading required package: e1071
if (!require("ROCR",character.only = TRUE)) (install.packages("ROCR",dep=TRUE))
## Loading required package: ROCR
#if (!require("faraway",character.only = TRUE)) (install.packages("faraway",dep=TRUE))

library(ggplot2)
library(MASS)
library(knitr)
library(xtable)
library(dplyr)
library(psych)
library(stringr)
library(car)
library(caret)
## Loading required package: lattice
library(e1071)
library(ROCR)
#library(faraway)

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.0     v purrr   0.3.4
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%()    masks ggplot2::%+%()
## x psych::alpha()  masks ggplot2::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
## x car::recode()   masks dplyr::recode()
## x dplyr::select() masks MASS::select()
## x purrr::some()   masks car::some()
library(dplyr)

crime_eval_df <- read.csv("https://raw.githubusercontent.com/johnm1990/DATA621/main/hw3/crime-evaluation-data_modified.csv")
crime_train_df <- read.csv("https://raw.githubusercontent.com/johnm1990/DATA621/main/hw3/crime-training-data_modified.csv")

Introduction:

# create basic summaries (quartiles, mean, missing values) for every variable
# standard deviations are added to these summaries further below
summary(crime_train_df)
##        zn             indus             chas              nox        
##  Min.   :  0.00   Min.   : 0.460   Min.   :0.00000   Min.   :0.3890  
##  1st Qu.:  0.00   1st Qu.: 5.145   1st Qu.:0.00000   1st Qu.:0.4480  
##  Median :  0.00   Median : 9.690   Median :0.00000   Median :0.5380  
##  Mean   : 11.58   Mean   :11.105   Mean   :0.07082   Mean   :0.5543  
##  3rd Qu.: 16.25   3rd Qu.:18.100   3rd Qu.:0.00000   3rd Qu.:0.6240  
##  Max.   :100.00   Max.   :27.740   Max.   :1.00000   Max.   :0.8710  
##        rm             age              dis              rad       
##  Min.   :3.863   Min.   :  2.90   Min.   : 1.130   Min.   : 1.00  
##  1st Qu.:5.887   1st Qu.: 43.88   1st Qu.: 2.101   1st Qu.: 4.00  
##  Median :6.210   Median : 77.15   Median : 3.191   Median : 5.00  
##  Mean   :6.291   Mean   : 68.37   Mean   : 3.796   Mean   : 9.53  
##  3rd Qu.:6.630   3rd Qu.: 94.10   3rd Qu.: 5.215   3rd Qu.:24.00  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.00  
##       tax           ptratio         lstat             medv      
##  Min.   :187.0   Min.   :12.6   Min.   : 1.730   Min.   : 5.00  
##  1st Qu.:281.0   1st Qu.:16.9   1st Qu.: 7.043   1st Qu.:17.02  
##  Median :334.5   Median :18.9   Median :11.350   Median :21.20  
##  Mean   :409.5   Mean   :18.4   Mean   :12.631   Mean   :22.59  
##  3rd Qu.:666.0   3rd Qu.:20.2   3rd Qu.:16.930   3rd Qu.:25.00  
##  Max.   :711.0   Max.   :22.0   Max.   :37.970   Max.   :50.00  
##      target      
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.4914  
##  3rd Qu.:1.0000  
##  Max.   :1.0000
# Scatterplots between the independent variables and the target
# ggplot(data = crime_train_df) +
#   geom_point(mapping = aes(x = , y = target))

## matrix of scatterplots

# Simple bar plot of the target variable (adjust the scale as needed)
counts <- table(crime_train_df$target)
barplot(counts, main = "Crime Distribution",
        xlab = "Target (0 = below median crime rate, 1 = above)",
        ylab = "Number of Neighborhoods")

# scatterplots for the target and selected predictors
pairs(~target + dis +  lstat + ptratio ,
      pch = 19, data = crime_train_df)

# tax stands out and is a sensible candidate for transformation; its values range from 187 to 711,
# so we apply a log transformation and store the result in log_tax
crime_train_df$log_tax <- log(crime_train_df$tax)
##summary(crime_train_df)

#crime_train_df$chas <-  as.factor(crime_train_df$chas)
#crime_train_df$target <-  as.factor(crime_train_df$target)


# bucket lstat into 5 groups of roughly equal size
crime_train_df$statbuk <- as.numeric(cut_number(crime_train_df$lstat,5))
table(crime_train_df$statbuk)
## 
##  1  2  3  4  5 
## 95 92 93 94 92
# check for high standard deviations; variables with high SDs are candidates for transformation
apply(crime_train_df,2,sd)
##          zn       indus        chas         nox          rm         age 
##  23.3646511   6.8458549   0.2567920   0.1166667   0.7048513  28.3213784 
##         dis         rad         tax     ptratio       lstat        medv 
##   2.1069496   8.6859272 167.9000887   2.1968447   7.1018907   9.2396814 
##      target     log_tax     statbuk 
##   0.5004636   0.3943914   1.4172256
summary(crime_train_df)
##        zn             indus             chas              nox        
##  Min.   :  0.00   Min.   : 0.460   Min.   :0.00000   Min.   :0.3890  
##  1st Qu.:  0.00   1st Qu.: 5.145   1st Qu.:0.00000   1st Qu.:0.4480  
##  Median :  0.00   Median : 9.690   Median :0.00000   Median :0.5380  
##  Mean   : 11.58   Mean   :11.105   Mean   :0.07082   Mean   :0.5543  
##  3rd Qu.: 16.25   3rd Qu.:18.100   3rd Qu.:0.00000   3rd Qu.:0.6240  
##  Max.   :100.00   Max.   :27.740   Max.   :1.00000   Max.   :0.8710  
##        rm             age              dis              rad       
##  Min.   :3.863   Min.   :  2.90   Min.   : 1.130   Min.   : 1.00  
##  1st Qu.:5.887   1st Qu.: 43.88   1st Qu.: 2.101   1st Qu.: 4.00  
##  Median :6.210   Median : 77.15   Median : 3.191   Median : 5.00  
##  Mean   :6.291   Mean   : 68.37   Mean   : 3.796   Mean   : 9.53  
##  3rd Qu.:6.630   3rd Qu.: 94.10   3rd Qu.: 5.215   3rd Qu.:24.00  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.00  
##       tax           ptratio         lstat             medv      
##  Min.   :187.0   Min.   :12.6   Min.   : 1.730   Min.   : 5.00  
##  1st Qu.:281.0   1st Qu.:16.9   1st Qu.: 7.043   1st Qu.:17.02  
##  Median :334.5   Median :18.9   Median :11.350   Median :21.20  
##  Mean   :409.5   Mean   :18.4   Mean   :12.631   Mean   :22.59  
##  3rd Qu.:666.0   3rd Qu.:20.2   3rd Qu.:16.930   3rd Qu.:25.00  
##  Max.   :711.0   Max.   :22.0   Max.   :37.970   Max.   :50.00  
##      target          log_tax         statbuk     
##  Min.   :0.0000   Min.   :5.231   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:5.638   1st Qu.:2.000  
##  Median :0.0000   Median :5.813   Median :3.000  
##  Mean   :0.4914   Mean   :5.935   Mean   :2.991  
##  3rd Qu.:1.0000   3rd Qu.:6.501   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :6.567   Max.   :5.000
# zn has a high standard deviation relative to its range, which makes it a strong candidate
# for transformation

# the standard deviation measures the average deviation of the values from the mean;
# for example, indus has a mean of 11.105, an average deviation from the mean of about 6.84,
# and a range of roughly 0.46 to 27.74 -- this tells us "how spread out" the values are

# some variables, such as age, are naturally spread out, so a high SD is expected there

# in a regression, the coefficient for age would be read as: for each additional year of age,
# the response changes by X units

# transforming variables comes with a cost: unless there is a clear need, we should stay
# conservative so as not to hurt interpretability

# rad also has a relatively large standard deviation, with values ranging from 1 to 24


## among the candidates for transformation we see medv, zn, indus, and rad
## keep in mind that transformed variables change how the regression coefficients are interpreted
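
As a rough, illustrative check of "how spread out" each variable is relative to its own scale (a sketch added here for clarity; it was not part of the original analysis), the standard deviation can be scaled by each variable's range:

# sd relative to range for the 13 original columns (zn ... target)
apply(crime_train_df[, 1:13], 2, function(x) sd(x) / diff(range(x)))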

Many types of statistical data exhibit a “variance-on-mean relationship”, meaning that the variability is different for data values with different expected values. The log transformation can be used to make highly skewed distributions less skewed. This can be valuable both for making patterns in the data more interpretable and for helping to meet the assumptions of inferential statistics.

A variance-stabilizing transformation aims to remove a variance-on-mean relationship, so that the variance becomes constant relative to the mean. Examples of variance-stabilizing transformations are the Fisher transformation for the sample correlation coefficient, the square root transformation or Anscombe transform for Poisson data (count data), the Box–Cox transformation for regression analysis, and the arcsine square root transformation or angular transformation for proportions (binomial data). While commonly used for statistical analysis of proportional data, the arcsine square root transformation is not recommended, because logistic regression or a logit transformation is more appropriate for binomial or non-binomial proportions, respectively, especially due to decreased type-II error.
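
As a quick sanity check (a sketch; skewness() comes from the e1071 package loaded above), we can compare the skewness of a right-skewed predictor such as tax before and after the log transformation:

# skewness before vs. after log-transforming tax (values closer to 0 indicate less skew)
skewness(crime_train_df$tax)
skewness(log(crime_train_df$tax))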

# log-transform the variables with high standard deviations
# note: zn contains many zeros, so log(zn) produces -Inf values (see the note before model 2)
crime_train_df$zn_log <- log(crime_train_df$zn)
crime_train_df$rad_log <- log(crime_train_df$rad)

## indus has a wider range than rad, but rad has the higher standard deviation

For the first model we use all of the explanatory variables we have. What differentiates a logistic regression from a linear regression is that we must specify family = binomial: the target is a binomial variable that takes the values 0 or 1, not a continuous variable.

Logistic regression is a method for fitting a regression curve, y = f(x), when y is a categorical variable. The typical use of this model is predicting y given a set of predictors x. The predictors can be continuous, categorical or a mix of both.

The categorical variable y, in general, can assume different values. In the simplest case y is binary, meaning it can assume either the value 1 or 0. In the classification task for this assignment, each record has a response variable indicating whether the neighborhood's crime rate is above the median crime rate (1) or not (0).

We call this model "binomial logistic regression", since the variable to predict is binary.

Note that glm() in R accepts either a numeric 0/1 response or a two-level factor for family = binomial; here target is kept numeric, so we wrap it in as.numeric() in the model formulas below.

## tax and log_tax are (nearly) perfectly correlated; including both would introduce multicollinearity. Our first model therefore uses only the untransformed variables.

The Akaike information criterion (AIC) is an estimator of prediction error and thereby relative quality of statistical models for a given set of data. Given a collection of models for the data, AIC estimates the quality of each model, relative to each of the other models.

The AIC is computed as 2K - 2(log-likelihood), where K is the number of estimated parameters. Lower AIC values indicate a better-fitting model; by a common rule of thumb, a model whose AIC is more than 2 units lower than a competitor's (delta-AIC > 2) is considered meaningfully better than the model it is being compared to.
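
As an illustration of this formula (a sketch; `fit` is a placeholder model used only here, not elsewhere in this report), AIC can be reproduced from a fitted glm's log-likelihood:

# AIC = 2K - 2*logLik, where K is the number of estimated parameters
fit <- glm(target ~ nox + rad, data = crime_train_df, family = "binomial")  # illustrative model only
k <- attr(logLik(fit), "df")
c(manual = 2 * k - 2 * as.numeric(logLik(fit)), built_in = AIC(fit))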

# model 1: All variables in original units
crime_train_df_ori <- crime_train_df[,1:13]
crime_model1 <- glm(as.numeric(target) ~ ., data = crime_train_df_ori, family = "binomial")
summary(crime_model1)
## 
## Call:
## glm(formula = as.numeric(target) ~ ., family = "binomial", data = crime_train_df_ori)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8464  -0.1445  -0.0017   0.0029   3.4665  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -40.822934   6.632913  -6.155 7.53e-10 ***
## zn           -0.065946   0.034656  -1.903  0.05706 .  
## indus        -0.064614   0.047622  -1.357  0.17485    
## chas          0.910765   0.755546   1.205  0.22803    
## nox          49.122297   7.931706   6.193 5.90e-10 ***
## rm           -0.587488   0.722847  -0.813  0.41637    
## age           0.034189   0.013814   2.475  0.01333 *  
## dis           0.738660   0.230275   3.208  0.00134 ** 
## rad           0.666366   0.163152   4.084 4.42e-05 ***
## tax          -0.006171   0.002955  -2.089  0.03674 *  
## ptratio       0.402566   0.126627   3.179  0.00148 ** 
## lstat         0.045869   0.054049   0.849  0.39608    
## medv          0.180824   0.068294   2.648  0.00810 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 192.05  on 453  degrees of freedom
## AIC: 218.05
## 
## Number of Fisher Scoring iterations: 9
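
Since multicollinearity was flagged as a concern above, one quick check (a sketch; vif() is from the car package loaded earlier) is to compute variance inflation factors for the full model:

# variance inflation factors for model 1; values well above ~5-10 suggest problematic collinearity
vif(crime_model1)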

Forward selection starts with the single variable that is most significant for the response, then adds a second, a third, and so on, stopping at the point where adding more variables no longer improves the model's performance.

Backward selection is the reverse: we start with the full model and remove variables one at a time, beginning with the least significant, until we reach the performance we are aiming for.

The lower a p-value is, the stronger the evidence that the variable has an effect on the target (here, whether a neighborhood is above the city's median crime rate). The variables with the highest p-values are the least significant, so we remove them one at a time as long as the model keeps improving, and stop once removing further variables starts to hurt performance.
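
Backward selection can also be automated (a sketch; stepAIC() is from the MASS package loaded above, and this is illustrative rather than the procedure actually followed below):

# automated backward elimination by AIC, starting from the full model fitted above
step_model <- stepAIC(crime_model1, direction = "backward", trace = FALSE)
# summary(step_model)  # compare with the manually reduced model 3 below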

# use the data frame with the log transformations in place of tax and rad; these variables had high
# standard deviations, which may have affected the first model's results. We usually start with all
# the desired variables and then remove them one by one.

# zn_log could not be used: log(0) = -Inf for the many zero values of zn (use summary() to troubleshoot such issues)

# we can experiment by adding and removing variables to see how performance changes

#model 2: all variables with log transformation for tax and rad
crime_model2 <- glm(as.numeric(target) ~ zn+indus+chas+nox+rm+age+dis+rad_log+log_tax+ptratio+lstat+medv, data = crime_train_df, family = "binomial")
summary(crime_model2)
## 
## Call:
## glm(formula = as.numeric(target) ~ zn + indus + chas + nox + 
##     rm + age + dis + rad_log + log_tax + ptratio + lstat + medv, 
##     family = "binomial", data = crime_train_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8575  -0.1235  -0.0007   0.0707   3.5004  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -34.90145    8.39847  -4.156 3.24e-05 ***
## zn           -0.06455    0.03375  -1.913 0.055805 .  
## indus        -0.06326    0.04960  -1.276 0.202106    
## chas          1.00455    0.74399   1.350 0.176947    
## nox          47.37678    7.78125   6.089 1.14e-09 ***
## rm           -0.47292    0.71479  -0.662 0.508218    
## age           0.03422    0.01365   2.507 0.012163 *  
## dis           0.72790    0.22670   3.211 0.001323 ** 
## rad_log       3.18812    0.76229   4.182 2.89e-05 ***
## log_tax      -1.63697    1.15806  -1.414 0.157495    
## ptratio       0.41666    0.12560   3.317 0.000908 ***
## lstat         0.03535    0.05411   0.653 0.513575    
## medv          0.17868    0.06844   2.611 0.009039 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 195.65  on 453  degrees of freedom
## AIC: 221.65
## 
## Number of Fisher Scoring iterations: 8

High p-values indicate that your evidence is not strong enough to suggest an effect exists in the population. An effect might exist but it’s possible that the effect size is too small, the sample size is too small, or there is too much variability for the hypothesis test to detect it.

Note that chas is a dummy (0/1) variable; had it been converted to a factor, it would appear in the summary output as chas1.

#model 3: Backward selection, removing variables one by one based on the p-value
crime_model3 <- glm(as.numeric(target) ~ zn+nox+age+dis+rad+tax+ptratio+medv, data = crime_train_df, family = "binomial")
summary(crime_model3)
## 
## Call:
## glm(formula = as.numeric(target) ~ zn + nox + age + dis + rad + 
##     tax + ptratio + medv, family = "binomial", data = crime_train_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8295  -0.1752  -0.0021   0.0032   3.4191  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -37.415922   6.035013  -6.200 5.65e-10 ***
## zn           -0.068648   0.032019  -2.144  0.03203 *  
## nox          42.807768   6.678692   6.410 1.46e-10 ***
## age           0.032950   0.010951   3.009  0.00262 ** 
## dis           0.654896   0.214050   3.060  0.00222 ** 
## rad           0.725109   0.149788   4.841 1.29e-06 ***
## tax          -0.007756   0.002653  -2.924  0.00346 ** 
## ptratio       0.323628   0.111390   2.905  0.00367 ** 
## medv          0.110472   0.035445   3.117  0.00183 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 645.88  on 465  degrees of freedom
## Residual deviance: 197.32  on 457  degrees of freedom
## AIC: 215.32
## 
## Number of Fisher Scoring iterations: 9
# fewer variables in the model is generally better: here we reduced the number of variables and improved quality.
# We keep removing the variable with the highest p-value until the model contains only significant
# variables, at which point there is no need to remove any more.

# removing variables beyond that point would start to hurt the model's performance

Notice the estimates above. Some coefficients are not intuitive: tax, for example, has a negative (and small) estimated effect on the probability of high crime. Counter-intuitive coefficients are not necessarily a problem; sometimes we should leave room for the data to change our views and treat these results as insights.


Interpreting the logistic regression coefficients (they are on the log-odds scale): a one-unit increase in nox raises the log-odds of a neighborhood being above the median crime rate by about 42.8; since nox only ranges from roughly 0.39 to 0.87, a 0.01 increase is a more meaningful unit, raising the log-odds by about 0.43.

Each additional year of age raises the log-odds by about 0.033, i.e. it multiplies the odds of being above the median crime rate by roughly exp(0.033), about 1.034 (a 3.4% increase in the odds).

Each additional dollar of tax lowers the log-odds by about 0.008. Despite being small in magnitude, this effect is statistically significant: we are fairly confident it exists, even though it is small.
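
To read the coefficients on the odds scale (a minimal sketch), they can be exponentiated to give the multiplicative change in the odds for a one-unit increase in each predictor:

# odds ratios for model 3: exp(coefficient) = multiplicative change in the odds per unit increase
exp(coef(crime_model3))
# exp(confint(crime_model3))  # optional profile-likelihood confidence intervals (can be slow)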

  1. SELECT MODELS (25 Points)

Decide on the criteria for selecting the best binary logistic regression model. Will you select models with slightly worse performance if it makes more sense or is more parsimonious? Discuss why you selected your models.

For the binary logistic regression model, will you use a metric such as log likelihood, AIC, ROC curve, etc.? Using the training data set, evaluate the binary logistic regression model based on (a) accuracy, (b) classification error rate, (c) precision, (d) sensitivity, (e) specificity, (f) F1 score, (g) AUC, and (h) confusion matrix. Make predictions using the evaluation data set.

Our task is to predict whether a neighborhood's crime rate is above or below the median, and we pick the model with the best performance. Even where some coefficients are counter-intuitive, that is itself insightful: what we expected is not what we are seeing. The backward-selection model is the best performer, and all of its variables are significant even though it uses fewer variables than the full model. For the binary logistic regression we chose (crime_model3), we used AIC to evaluate performance: it is the model with the lowest AIC, 215.32.
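
For reference, the AIC values of the three candidate models can be listed side by side (a sketch):

# compare the three fitted models by AIC (lower is better)
AIC(crime_model1, crime_model2, crime_model3)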

Accuracy is the number of correctly predicted outcomes divided by the total number of predictions. The default threshold on the normalized predicted probabilities (which lie between 0 and 1) is 0.5.

We start with the confusion matrix, which cross-tabulates the predicted classes against the actual classes: predicted 0 and actually 0, predicted 1 and actually 1, and the two kinds of misclassification.

Two indices are used to evaluate the accuracy of a test that predicts dichotomous outcomes ( e.g. logistic regression) - sensitivity and specificity. They describe how well a test discriminates between cases with and without a certain condition.

Sensitivity - the proportion of true positives or the proportion of cases correctly identified by the test as meeting a certain condition.

Specificity - the proportion of true negatives or the proportion of cases correctly identified by the test as not meeting a certain condition.

The F1 score can be interpreted as a harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are equal. The formula for the F1 score is:

F1 = 2 * (precision * recall) / (precision + recall)

threshold=0.5
predicted_values<-ifelse(predict(crime_model3,type="response")>threshold,1,0)
actual_values<-crime_model3$y
conf_matrix<-table(predicted_values,actual_values)
conf_matrix
##                 actual_values
## predicted_values   0   1
##                0 218  22
##                1  19 207
sensitivity(conf_matrix)
## [1] 0.9198312
specificity(conf_matrix)
## [1] 0.9039301
result<-confusionMatrix(conf_matrix)






precision <- result$byClass['Pos Pred Value']
precision
## Pos Pred Value 
##      0.9083333
class_error_rate <- 1-result$overall['Accuracy']
class_error_rate
##   Accuracy 
## 0.08798283
f1 <- result$byClass['F1']
f1
##        F1 
## 0.9140461
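
As a cross-check on the caret output (a sketch; note that confusionMatrix() above treats the first factor level, "0", as the positive class by default, whereas the calculation below treats class 1 as positive), the same metrics can be derived directly from the four cells of conf_matrix:

# manual metrics from the confusion matrix, with class 1 as the positive class
TP <- conf_matrix["1", "1"]; TN <- conf_matrix["0", "0"]
FP <- conf_matrix["1", "0"]; FN <- conf_matrix["0", "1"]
c(accuracy  = (TP + TN) / sum(conf_matrix),
  precision = TP / (TP + FP),
  recall    = TP / (TP + FN),
  F1        = 2 * TP / (2 * TP + FP + FN))
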
p <- predict(crime_model3, type = "response")
roc_pred <- prediction(predictions = p,labels=crime_model3$y)

auc.tmp <- performance(roc_pred,"auc"); auc <- as.numeric(auc.tmp@y.values)
auc
## [1] 0.9719382
#plotting roc
roc_perf <- performance(roc_pred , "tpr" , "fpr")
plot(roc_perf,
     colorize = TRUE,
     print.cutoffs.at= seq(0,1,0.05),
     text.adj=c(-0.2,1.7))
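
The ROC plot also shows how sensitive the results are to the cutoff. As a sketch (not part of the original analysis), the cutoff that maximizes Youden's J = TPR - FPR can be pulled out of the ROCR performance object:

# cutoff maximizing TPR - FPR, as an alternative to the fixed 0.5 threshold
tpr <- roc_perf@y.values[[1]]
fpr <- roc_perf@x.values[[1]]
cutoffs <- roc_perf@alpha.values[[1]]
cutoffs[which.max(tpr - fpr)]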

threshold=0.5
crime_eval_df$target<-ifelse(predict(crime_model3,crime_eval_df,type="response")>threshold,1,0)
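
Finally, the scored evaluation set could be written out for submission (a sketch; the file name is an assumption, not one taken from the assignment):

# write.csv(crime_eval_df, "crime_eval_predictions.csv", row.names = FALSE)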