if (!require("ggplot2",character.only = TRUE)) (install.packages("ggplot2",dep=TRUE))
## Loading required package: ggplot2
if (!require("MASS",character.only = TRUE)) (install.packages("MASS",dep=TRUE))
## Loading required package: MASS
if (!require("knitr",character.only = TRUE)) (install.packages("knitr",dep=TRUE))
## Loading required package: knitr
if (!require("xtable",character.only = TRUE)) (install.packages("xtable",dep=TRUE))
## Loading required package: xtable
if (!require("dplyr",character.only = TRUE)) (install.packages("dplyr",dep=TRUE))
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if (!require("psych",character.only = TRUE)) (install.packages("psych",dep=TRUE))
## Loading required package: psych
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
if (!require("stringr",character.only = TRUE)) (install.packages("stringr",dep=TRUE))
## Loading required package: stringr
if (!require("car",character.only = TRUE)) (install.packages("car",dep=TRUE))
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
if (!require("e1071",character.only = TRUE)) (install.packages("e1071",dep=TRUE))
## Loading required package: e1071
if (!require("ROCR",character.only = TRUE)) (install.packages("ROCR",dep=TRUE))
## Loading required package: ROCR
#if (!require("faraway",character.only = TRUE)) (install.packages("faraway",dep=TRUE))
library(ggplot2)
library(MASS)
library(knitr)
library(xtable)
library(dplyr)
library(psych)
library(stringr)
library(car)
library(caret)
## Loading required package: lattice
library(e1071)
library(ROCR)
#library(faraway)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.0 v purrr 0.3.4
## v tidyr 1.1.3 v forcats 0.5.1
## v readr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%() masks ggplot2::%+%()
## x psych::alpha() masks ggplot2::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x car::recode() masks dplyr::recode()
## x dplyr::select() masks MASS::select()
## x purrr::some() masks car::some()
library(dplyr)
crime_eval_df <- read.csv("https://raw.githubusercontent.com/johnm1990/DATA621/main/hw3/crime-evaluation-data_modified.csv")
crime_train_df <- read.csv("https://raw.githubusercontent.com/johnm1990/DATA621/main/hw3/crime-training-data_modified.csv")
Intro:
# create summaries for every variable showing basic summaries + NAs
### add the standard deviations to our summaries
summary(crime_train_df)
## zn indus chas nox
## Min. : 0.00 Min. : 0.460 Min. :0.00000 Min. :0.3890
## 1st Qu.: 0.00 1st Qu.: 5.145 1st Qu.:0.00000 1st Qu.:0.4480
## Median : 0.00 Median : 9.690 Median :0.00000 Median :0.5380
## Mean : 11.58 Mean :11.105 Mean :0.07082 Mean :0.5543
## 3rd Qu.: 16.25 3rd Qu.:18.100 3rd Qu.:0.00000 3rd Qu.:0.6240
## Max. :100.00 Max. :27.740 Max. :1.00000 Max. :0.8710
## rm age dis rad
## Min. :3.863 Min. : 2.90 Min. : 1.130 Min. : 1.00
## 1st Qu.:5.887 1st Qu.: 43.88 1st Qu.: 2.101 1st Qu.: 4.00
## Median :6.210 Median : 77.15 Median : 3.191 Median : 5.00
## Mean :6.291 Mean : 68.37 Mean : 3.796 Mean : 9.53
## 3rd Qu.:6.630 3rd Qu.: 94.10 3rd Qu.: 5.215 3rd Qu.:24.00
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.00
## tax ptratio lstat medv
## Min. :187.0 Min. :12.6 Min. : 1.730 Min. : 5.00
## 1st Qu.:281.0 1st Qu.:16.9 1st Qu.: 7.043 1st Qu.:17.02
## Median :334.5 Median :18.9 Median :11.350 Median :21.20
## Mean :409.5 Mean :18.4 Mean :12.631 Mean :22.59
## 3rd Qu.:666.0 3rd Qu.:20.2 3rd Qu.:16.930 3rd Qu.:25.00
## Max. :711.0 Max. :22.0 Max. :37.970 Max. :50.00
## target
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4914
## 3rd Qu.:1.0000
## Max. :1.0000
# Scatterplots between the independent variables and the target
# ggplot(data = crime_train_df) +
# geom_point(mapping = aes(x = , y= target))
##matrix of scatterplots
###Change the scale
# Simple Bar Plot, adjust the scale of the bar plot
counts <- table(crime_train_df$target)
barplot(counts, main="Crime Distribution",
xlab="Target (0 = below median crime rate, 1 = above)",
ylab="Number of Neighborhoods")
#scatterplots for the target and predictors
pairs(~target + dis + lstat + ptratio ,
pch = 19, data = crime_train_df)
#tax is a notable candidate for transformation (min 187, max 711), so we apply a log transformation
#the result is stored in log_tax
crime_train_df$log_tax <- log(crime_train_df$tax)
##summary(crime_train_df)
#crime_train_df$chas <- as.factor(crime_train_df$chas)
#crime_train_df$target <- as.factor(crime_train_df$target)
crime_train_df$statbuk <- as.numeric(cut_number(crime_train_df$lstat,5))
table(crime_train_df$statbuk)
##
## 1 2 3 4 5
## 95 92 93 94 92
#check if high SDs, then transform
apply(crime_train_df,2,sd)
## zn indus chas nox rm age
## 23.3646511 6.8458549 0.2567920 0.1166667 0.7048513 28.3213784
## dis rad tax ptratio lstat medv
## 2.1069496 8.6859272 167.9000887 2.1968447 7.1018907 9.2396814
## target log_tax statbuk
## 0.5004636 0.3943914 1.4172256
summary(crime_train_df)
## zn indus chas nox
## Min. : 0.00 Min. : 0.460 Min. :0.00000 Min. :0.3890
## 1st Qu.: 0.00 1st Qu.: 5.145 1st Qu.:0.00000 1st Qu.:0.4480
## Median : 0.00 Median : 9.690 Median :0.00000 Median :0.5380
## Mean : 11.58 Mean :11.105 Mean :0.07082 Mean :0.5543
## 3rd Qu.: 16.25 3rd Qu.:18.100 3rd Qu.:0.00000 3rd Qu.:0.6240
## Max. :100.00 Max. :27.740 Max. :1.00000 Max. :0.8710
## rm age dis rad
## Min. :3.863 Min. : 2.90 Min. : 1.130 Min. : 1.00
## 1st Qu.:5.887 1st Qu.: 43.88 1st Qu.: 2.101 1st Qu.: 4.00
## Median :6.210 Median : 77.15 Median : 3.191 Median : 5.00
## Mean :6.291 Mean : 68.37 Mean : 3.796 Mean : 9.53
## 3rd Qu.:6.630 3rd Qu.: 94.10 3rd Qu.: 5.215 3rd Qu.:24.00
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.00
## tax ptratio lstat medv
## Min. :187.0 Min. :12.6 Min. : 1.730 Min. : 5.00
## 1st Qu.:281.0 1st Qu.:16.9 1st Qu.: 7.043 1st Qu.:17.02
## Median :334.5 Median :18.9 Median :11.350 Median :21.20
## Mean :409.5 Mean :18.4 Mean :12.631 Mean :22.59
## 3rd Qu.:666.0 3rd Qu.:20.2 3rd Qu.:16.930 3rd Qu.:25.00
## Max. :711.0 Max. :22.0 Max. :37.970 Max. :50.00
## target log_tax statbuk
## Min. :0.0000 Min. :5.231 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:5.638 1st Qu.:2.000
## Median :0.0000 Median :5.813 Median :3.000
## Mean :0.4914 Mean :5.935 Mean :2.991
## 3rd Qu.:1.0000 3rd Qu.:6.501 3rd Qu.:4.000
## Max. :1.0000 Max. :6.567 Max. :5.000
#zn has a high standard deviation; considering that its range of values is not that large,
#this makes it a strong candidate for transformation
#the standard deviation gives the average deviation of the values from the mean:
# for example, indus has a mean of 11.105, an average deviation from the mean of 6.84, and a range of roughly 0.46 to 27.74
# this lets us see how spread out the values are
# for some variables a high SD is expected; age, for instance, is naturally spread out
#using the regression coefficient for age, we could say that as age increases by 1 unit, the predicted crime outcome changes by X units
#transforming variables comes with a cost; unless there is a clear need, we should be conservative so as not to hurt interpretability
#rad also has a relatively large standard deviation, with values ranging from 1 to 24
## among the candidates for transformation we see medv, zn, indus, and rad
## we must always bear in mind that if we transform variables, the interpretation of the coefficients in our regression model changes
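To make the candidates explicit (an illustrative sketch, not part of the original workflow), the original columns can be ranked by their standard deviations:
#rank the original columns by standard deviation (higher values are more spread out)
sort(apply(crime_train_df[, 1:13], 2, sd), decreasing = TRUE)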
Many types of statistical data exhibit a “variance-on-mean relationship”, meaning that the variability is different for data values with different expected values. The log transformation can be used to make highly skewed distributions less skewed. This can be valuable both for making patterns in the data more interpretable and for helping to meet the assumptions of inferential statistics.
A variance-stabilizing transformation aims to remove a variance-on-mean relationship, so that the variance becomes constant relative to the mean. Examples of variance-stabilizing transformations are the Fisher transformation for the sample correlation coefficient, the square root transformation or Anscombe transform for Poisson (count) data, the Box-Cox transformation for regression analysis, and the arcsine square root (angular) transformation for proportions (binomial data). Although it has commonly been used for proportional data, the arcsine square root transformation is no longer recommended: logistic regression (for binomial proportions) or a logit transformation (for non-binomial proportions) is more appropriate, in particular because of the lower type II error rate.
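As a quick illustration (a sketch using e1071::skewness, which is already loaded; not part of the original analysis), we can compare the skewness of tax before and after the log transformation; values closer to zero indicate a more symmetric distribution.
#skewness of tax before and after the log transformation already stored in log_tax
skewness(crime_train_df$tax)
skewness(crime_train_df$log_tax)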
#transform into logs for high standard deviation
crime_train_df$zn_log <- log(crime_train_df$zn)
crime_train_df$rad_log <- log(crime_train_df$rad)
##INDUS compared to RAD has a higher range, however RAD has higher standard deviation than INDUS
For the first model we can utilize all the explanatory variables we have. What differentiates a logistic regression from a linear regression is that we must specify family = binomial; this tells glm that our target variable is binomial, taking the values 0 or 1, rather than continuous.
Logistic regression is a method for fitting a regression curve, y = f(x), when y is a categorical variable. The typical use of this model is predicting y given a set of predictors x; the predictors can be continuous, categorical, or a mix of both.
The categorical variable y can, in general, take several values. In the simplest case y is binary, meaning it can take either the value 1 or 0. In the data used for our classification assignment, each record has a response variable indicating whether the crime rate is above the median crime rate (1) or not (0).
We call the model “binomial logistic regression” since the variable to predict is binary.
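In symbols, the fitted model is log(p / (1 - p)) = b0 + b1*x1 + ... + bk*xk, where p is the probability that target = 1, so the estimated coefficients are on the log-odds scale.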
##we have variables that are perfectly correlated, such as tax and log_tax; including both introduces multicollinearity. Our first model is built without the transformations
The Akaike information criterion (AIC) is an estimator of prediction error and thereby relative quality of statistical models for a given set of data. Given a collection of models for the data, AIC estimates the quality of each model, relative to each of the other models.
AIC is computed as 2K - 2*(log-likelihood), where K is the number of estimated parameters (this is checked numerically right after model 1 below). Lower AIC values indicate a better-fitting model; by a common rule of thumb, a model whose AIC is more than 2 units lower than that of a competing model (a delta-AIC greater than 2) is considered meaningfully better than the model it is being compared to.
# model 1: All variables in original units
crime_train_df_ori <- crime_train_df[,1:13]
crime_model1 <- glm(as.numeric(target) ~ ., data = crime_train_df_ori, family = "binomial")
summary(crime_model1)
##
## Call:
## glm(formula = as.numeric(target) ~ ., family = "binomial", data = crime_train_df_ori)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8464 -0.1445 -0.0017 0.0029 3.4665
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -40.822934 6.632913 -6.155 7.53e-10 ***
## zn -0.065946 0.034656 -1.903 0.05706 .
## indus -0.064614 0.047622 -1.357 0.17485
## chas 0.910765 0.755546 1.205 0.22803
## nox 49.122297 7.931706 6.193 5.90e-10 ***
## rm -0.587488 0.722847 -0.813 0.41637
## age 0.034189 0.013814 2.475 0.01333 *
## dis 0.738660 0.230275 3.208 0.00134 **
## rad 0.666366 0.163152 4.084 4.42e-05 ***
## tax -0.006171 0.002955 -2.089 0.03674 *
## ptratio 0.402566 0.126627 3.179 0.00148 **
## lstat 0.045869 0.054049 0.849 0.39608
## medv 0.180824 0.068294 2.648 0.00810 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 192.05 on 453 degrees of freedom
## AIC: 218.05
##
## Number of Fisher Scoring iterations: 9
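As a quick numerical check of the AIC definition given earlier (a sketch, not part of the original workflow), the AIC of model 1 can be reproduced from its log-likelihood:
#verify AIC = 2K - 2*log-likelihood for model 1, where K is the number of estimated parameters
k <- length(coef(crime_model1))
2 * k - 2 * as.numeric(logLik(crime_model1))
AIC(crime_model1)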
#
Forward selection means we start with the variable that has the most significant effect on the response, then add a second, a third, and so on, stopping at the point where adding more variables no longer improves model performance.
Backward selection is the reverse: we start with the full model and remove variables one at a time, beginning with the least significant, until we reach the performance we are aiming for.
The lower the p-value, the stronger the evidence that a variable has an effect on the target, in this case whether a neighborhood is above the median crime rate of the city; the highest p-value corresponds to the lowest significance. We take these variables out one by one as long as the model improves, and stop once removing further variables reduces performance.
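Backward selection can also be automated. As an illustrative sketch (not how the models below were actually built), MASS::stepAIC can drop variables from the full model by AIC:
#automated backward selection by AIC, starting from the full model fit above
crime_model_step <- stepAIC(crime_model1, direction = "backward", trace = FALSE)
summary(crime_model_step)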
#use the dataframe that has the transformations, i.e. the log-transformed variables created above. These variables had high standard deviations, which may have impacted our results. We usually start with all the desired variables and remove the ones that are not significant
#we couldn't use zn_log: its minimum is -Inf because zn contains many zero values (use summary() to troubleshoot issues like this)
#we can experiment by adding and removing variables to see how performance changes
#model 2: all variables with log transformation for tax and rad
crime_model2 <- glm(as.numeric(target) ~ zn+indus+chas+nox+rm+age+dis+rad_log+log_tax+ptratio+lstat+medv, data = crime_train_df, family = "binomial")
summary(crime_model2)
##
## Call:
## glm(formula = as.numeric(target) ~ zn + indus + chas + nox +
## rm + age + dis + rad_log + log_tax + ptratio + lstat + medv,
## family = "binomial", data = crime_train_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8575 -0.1235 -0.0007 0.0707 3.5004
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -34.90145 8.39847 -4.156 3.24e-05 ***
## zn -0.06455 0.03375 -1.913 0.055805 .
## indus -0.06326 0.04960 -1.276 0.202106
## chas 1.00455 0.74399 1.350 0.176947
## nox 47.37678 7.78125 6.089 1.14e-09 ***
## rm -0.47292 0.71479 -0.662 0.508218
## age 0.03422 0.01365 2.507 0.012163 *
## dis 0.72790 0.22670 3.211 0.001323 **
## rad_log 3.18812 0.76229 4.182 2.89e-05 ***
## log_tax -1.63697 1.15806 -1.414 0.157495
## ptratio 0.41666 0.12560 3.317 0.000908 ***
## lstat 0.03535 0.05411 0.653 0.513575
## medv 0.17868 0.06844 2.611 0.009039 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 195.65 on 453 degrees of freedom
## AIC: 221.65
##
## Number of Fisher Scoring iterations: 8
High p-values indicate that your evidence is not strong enough to suggest an effect exists in the population. An effect might exist but it’s possible that the effect size is too small, the sample size is too small, or there is too much variability for the hypothesis test to detect it.
Notice that chas is a dummy (0/1 indicator) variable.
#model 3: Backward selection, removing variables one by one based on the p-value
crime_model3 <- glm(as.numeric(target) ~ zn+nox+age+dis+rad+tax+ptratio+medv, data = crime_train_df, family = "binomial")
summary(crime_model3)
##
## Call:
## glm(formula = as.numeric(target) ~ zn + nox + age + dis + rad +
## tax + ptratio + medv, family = "binomial", data = crime_train_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8295 -0.1752 -0.0021 0.0032 3.4191
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -37.415922 6.035013 -6.200 5.65e-10 ***
## zn -0.068648 0.032019 -2.144 0.03203 *
## nox 42.807768 6.678692 6.410 1.46e-10 ***
## age 0.032950 0.010951 3.009 0.00262 **
## dis 0.654896 0.214050 3.060 0.00222 **
## rad 0.725109 0.149788 4.841 1.29e-06 ***
## tax -0.007756 0.002653 -2.924 0.00346 **
## ptratio 0.323628 0.111390 2.905 0.00367 **
## medv 0.110472 0.035445 3.117 0.00183 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 197.32 on 457 degrees of freedom
## AIC: 215.32
##
## Number of Fisher Scoring iterations: 9
#fewer variables in the model is generally better: we are reducing the number of variables while improving quality. We keep removing the variable with the highest p-value
#until we reach a point where the model includes only significant variables and there is no need to remove any more
#if we kept removing variables beyond that point, it would negatively impact model performance
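For a side-by-side check (a sketch; all three models are already fit above), the AIC values can also be compared directly:
#compare the three fitted models by AIC (lower is better)
AIC(crime_model1, crime_model2, crime_model3)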
Notice the estimates above. Some coefficients are not intuitive: tax, for example, has a negative coefficient, implying a marginal negative effect on the odds of high crime. Sometimes we should keep some margin for the data to change our views and draw insight from what the data shows.
Interpreting the logistic regression coefficients (which are on the log-odds scale): if nox increases by 1 unit, the log-odds of a neighborhood being above the median crime rate increase by about 42.8; since nox only ranges from roughly 0.39 to 0.87, a 0.01-unit increase (log-odds +0.43) is a more realistic comparison.
Each additional unit of age increases the log-odds of being above the median crime rate by about 0.033, multiplying the odds by roughly 1.034.
Each additional dollar of tax decreases the log-odds by about 0.008. Despite its small magnitude, the effect is highly significant: we are confident it has an effect, small, but real.
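These effects are easier to read on the odds scale. As a sketch using the fitted crime_model3, the coefficients can be exponentiated into odds ratios, with approximate Wald confidence intervals:
#convert the log-odds coefficients of model 3 to odds ratios
exp(coef(crime_model3))
#approximate 95% Wald confidence intervals on the odds-ratio scale
exp(confint.default(crime_model3))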
Decide on the criteria for selecting the best binary logistic regression model. Will you select models with slightly worse performance if it makes more sense or is more parsimonious? Discuss why you selected your models.
For the binary logistic regression model, will you use a metric such as log likelihood, AIC, ROC curve, etc.? Using the training data set, evaluate the binary logistic regression model based on (a) accuracy, (b) classification error rate, (c) precision, (d) sensitivity, (e) specificity, (f) F1 score, (g) AUC, and (h) confusion matrix. Make predictions using the evaluation data set.
Our target is whether the crime rate of a neighborhood is above or below the median. We will pick the model with the best performance. Even if some of the coefficients are counter-intuitive, that is insightful in its own right: what we expected is not what we are seeing. All the variables in our backward-selection model are significant, and the model itself has the best performance; it uses fewer variables than the full model, yet every one of them is significant. For the binary logistic regression we chose (crime_model3), we used AIC to evaluate model performance and selected the model with the lowest AIC, 215.32.
Accuracy means the total number of correctly predicted outcomes over the total number of predictions. The default threshold is 0.5 for normalized predicted probabilities or scores in the range between 0 and 1.
We start with our confusion matrix, which cross-tabulates predictions against actual values: predicted 0 and actually 0, predicted 1 and actually 1, plus the two kinds of misclassification.
Two indices are used to evaluate the accuracy of a test that predicts dichotomous outcomes (e.g. logistic regression): sensitivity and specificity. They describe how well a test discriminates between cases with and without a certain condition.
Sensitivity - the proportion of true positives or the proportion of cases correctly identified by the test as meeting a certain condition.
Specificity - the proportion of true negatives or the proportion of cases correctly identified by the test as not meeting a certain condition.
The F1 score can be interpreted as a harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are equal. The formula for the F1 score is:
F1 = 2 * (precision * recall) / (precision + recall)
threshold=0.5
predicted_values<-ifelse(predict(crime_model3,type="response")>threshold,1,0)
actual_values<-crime_model3$y
conf_matrix<-table(predicted_values,actual_values)
conf_matrix
## actual_values
## predicted_values 0 1
## 0 218 22
## 1 19 207
sensitivity(conf_matrix)
## [1] 0.9198312
specificity(conf_matrix)
## [1] 0.9039301
result<-confusionMatrix(conf_matrix)
precision <- result$byClass['Pos Pred Value']
precision
## Pos Pred Value
## 0.9083333
class_error_rate <- 1-result$overall['Accuracy']
class_error_rate
## Accuracy
## 0.08798283
f1 <- result$byClass['F1']
f1
## F1
## 0.9140461
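As a sanity check (a sketch derived from the conf_matrix built above; caret's confusionMatrix treats the first level, "0", as the positive class by default, and the same convention is used here so the numbers match):
#manual check of the metrics from the confusion-matrix cells (rows = predicted, columns = actual)
TP <- conf_matrix["0", "0"]   #predicted 0, actually 0
FP <- conf_matrix["0", "1"]   #predicted 0, actually 1
FN <- conf_matrix["1", "0"]   #predicted 1, actually 0
TN <- conf_matrix["1", "1"]   #predicted 1, actually 1
accuracy_manual <- (TP + TN) / sum(conf_matrix)
precision_manual <- TP / (TP + FP)
recall_manual <- TP / (TP + FN)   #this is the sensitivity
f1_manual <- 2 * precision_manual * recall_manual / (precision_manual + recall_manual)
c(accuracy = accuracy_manual, precision = precision_manual, recall = recall_manual, F1 = f1_manual)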
p <- predict(crime_model3, type = "response")
roc_pred <- prediction(predictions = p,labels=crime_model3$y)
auc.tmp <- performance(roc_pred,"auc"); auc <- as.numeric(auc.tmp@y.values)
auc
## [1] 0.9719382
#plotting roc
roc_perf <- performance(roc_pred , "tpr" , "fpr")
plot(roc_perf,
colorize = TRUE,
print.cutoffs.at= seq(0,1,0.05),
text.adj=c(-0.2,1.7))
threshold=0.5
crime_eval_df$target<-ifelse(predict(crime_model3,crime_eval_df,type="response")>threshold,1,0)
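As a final check (a sketch; the file name below is only an illustrative placeholder), we can look at the predicted class balance on the evaluation set and optionally save the scored data:
#predicted class balance on the evaluation set
table(crime_eval_df$target)
#optionally write out the scored evaluation data (placeholder file name)
#write.csv(crime_eval_df, "crime_eval_predictions.csv", row.names = FALSE)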