HOMEWORK 3: LOGISTIC REGRESSION

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nnet)
library(skimr)

Introduction

In this homework assignment, you will explore, analyze and model a data set containing information on crime for various neighborhoods of a major city.

Each record includes a response variable indicating whether the neighborhood's crime rate is above the median (1) or not (0).

Your objective is to build a binary logistic regression model on the training data set to predict whether a neighborhood will be at risk for high crime levels. You will provide classifications and probabilities for the evaluation data set using your binary logistic regression model. You may only use the variables given to you (or variables that you derive from them).

Predictors:

- zn: proportion of residential land zoned for large lots (over 25,000 square feet)
- indus: proportion of non-retail business acres per suburb
- chas: a dummy variable for whether the suburb borders the Charles River (1) or not (0)
- nox: nitrogen oxides concentration (parts per 10 million)
- rm: average number of rooms per dwelling
- age: proportion of owner-occupied units built prior to 1940
- dis: weighted mean of distances to five Boston employment centers
- rad: index of accessibility to radial highways
- tax: full-value property-tax rate per $10,000
- ptratio: pupil-teacher ratio by town
- lstat: lower status of the population (percent)
- medv: median value of owner-occupied homes in $1,000s

target: whether the crime rate is above the median crime rate (1) or not (0) (response variable)

crime_train_data <- read_csv("crime-training-data_modified.csv")
## Rows: 466 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, lstat, medv...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
crime_eval_data <- read_csv("crime-evaluation-data_modified.csv")
## Rows: 40 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, lstat, medv
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

EXPLORATORY DATA ANALYSIS

head(crime_train_data)
## # A tibble: 6 × 13
##      zn indus  chas   nox    rm   age   dis   rad   tax ptratio lstat  medv
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1     0 19.6      0 0.605  7.93  96.2  2.05     5   403    14.7  3.7   50  
## 2     0 19.6      1 0.871  5.40 100    1.32     5   403    14.7 26.8   13.4
## 3     0 18.1      0 0.74   6.48 100    1.98    24   666    20.2 18.8   15.4
## 4    30  4.93     0 0.428  6.39   7.8  7.04     6   300    16.6  5.19  23.7
## 5     0  2.46     0 0.488  7.16  92.2  2.70     3   193    17.8  4.82  37.9
## 6     0  8.56     0 0.52   6.78  71.3  2.86     5   384    20.9  7.67  26.5
## # ℹ 1 more variable: target <dbl>
# View summary stats and missing values
skim_df_as_data_frame <- skim(crime_train_data) %>% arrange(desc(n_missing))
# Round the numeric summary statistics (mean, sd, percentiles) to 2 decimals
skim_df_as_data_frame <- skim_df_as_data_frame %>%
  mutate(across(starts_with("numeric.") & where(is.numeric), ~ round(.x, 2)))

skim_df_as_data_frame
Data summary

Name: crime_train_data
Number of rows: 466
Number of columns: 13
Column type frequency: numeric (13)
Group variables: None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
zn 0 1 11.58 23.36 0.00 0.00 0.00 16.25 100.00 ▇▁▁▁▁
indus 0 1 11.11 6.85 0.46 5.15 9.69 18.10 27.74 ▇▆▁▇▁
chas 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
nox 0 1 0.55 0.12 0.39 0.45 0.54 0.62 0.87 ▇▇▅▃▁
rm 0 1 6.29 0.70 3.86 5.89 6.21 6.63 8.78 ▁▂▇▂▁
age 0 1 68.37 28.32 2.90 43.88 77.15 94.10 100.00 ▂▂▂▃▇
dis 0 1 3.80 2.11 1.13 2.10 3.19 5.21 12.13 ▇▅▂▁▁
rad 0 1 9.53 8.69 1.00 4.00 5.00 24.00 24.00 ▇▂▁▁▃
tax 0 1 409.50 167.90 187.00 281.00 334.50 666.00 711.00 ▇▇▅▁▇
ptratio 0 1 18.40 2.20 12.60 16.90 18.90 20.20 22.00 ▁▃▅▅▇
lstat 0 1 12.63 7.10 1.73 7.04 11.35 16.93 37.97 ▇▇▅▂▁
medv 0 1 22.59 9.24 5.00 17.02 21.20 25.00 50.00 ▂▇▅▁▁
target 0 1 0.49 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▇
# save output to csv
write.csv(skim_df_as_data_frame, "H3_skim_output.csv", row.names = FALSE)

There are no missing values in the data set.
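
This can also be confirmed directly without skimr; a quick check:

# Count missing values in each column; every count should be zero
colSums(is.na(crime_train_data))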

Target Variable: Number of neighborhoods whose crime rate is above or below the median

crime_train_data %>% mutate(target_renamed = ifelse(target == 0, "Below", "Above")) %>% ggplot(aes(x = as_factor(target_renamed))) + geom_bar(color = "#686868") + 
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = 10, color = "white") +
  labs(title = "Number of Neighborhoods Above or Below Median Crime Rate", 
         y = "Count",
         x = "Crime Rate") + 
    theme_minimal()

columns <- colnames(crime_train_data)
columns <- columns[columns != "chas"]

crime_train_data %>% mutate(target_renamed = ifelse(target == 0, "Below", "Above")) %>%
  pivot_longer(cols = columns[-length(columns)], names_to = "Feature", values_to = "Value") %>%
  ggplot(aes(x = as_factor(target_renamed), y = Value)) + geom_boxplot(color = "#686868") +
  facet_wrap(~ Feature, nrow = 4, ncol = 3, scales = "free") +
  labs(title = "Distribution Between Neighborhood Crime Rates",
       y = "",
       x = "Crime Rate") +
    theme_minimal()

# Evaluate variable 'chas' separately 
crime_train_data %>% mutate(target_renamed = ifelse(target == 0, "Below", "Above")) %>%
  ggplot(aes(x = as_factor(target_renamed), fill = as_factor(chas))) + geom_bar(position = "stack") +
  labs(title = "Number of Neighborhoods Bordering the Charles River in each Target Category",
       y = "Count",
       x = "Crime Rate",
       fill = "Borders River") +
    theme_minimal()

There is a slightly higher proportion of neighborhoods bordering the Charles River in the group with crime rates above the median.

crime_train_data %>% mutate(target_renamed = ifelse(target == 0, "Below", "Above")) %>%
  pivot_longer(cols = columns[-length(columns)], names_to = "Feature", values_to = "Value") %>%
  ggplot(aes(x = Value, fill = as_factor(target_renamed))) + geom_histogram() +
  facet_wrap(~ Feature, nrow = 4, ncol = 3, scales = "free") +
  labs(title = "Distribution Between Neighborhood Crime Rates",
       y = "",
       fill = "Crime Rate") +
    theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The rad variable contains many observations with an index of 24, all belonging to neighborhoods whose crime rate is above the median. We see similar distributions in the tax, ptratio, and indus variables: a large portion of the observations at the higher end of each variable's range belong to neighborhoods whose crime rate is above the median.

crime_train_data %>% mutate(target_renamed = ifelse(target == 0, "Below", "Above"),
                            rad_cat = factor(ifelse(rad < 13, 0, 1))) %>% 
  ggplot(aes(x = as_factor(target_renamed), fill = rad_cat)) + geom_bar(position = "stack") + 
  labs(title = "Neighborhoods Above or Below 'rad' Index of 13 each Target Category",
       y = "Count",
       x = "Crime Rate",
       fill="rad index") + 
    theme_minimal()

The zn variable contains mostly zeros, indicating that the proportion of residential land zoned for large lots (over 25,000 square feet) is 0 for most neighborhoods. This continuous variable can be discretized into two categories: 0 or above 0.

crime_train_data %>% mutate(target_renamed = ifelse(target == 0, "Below", "Above"),
                            zn_cat = factor(ifelse(zn == 0, 0, 1))) %>% 
  ggplot(aes(x = as_factor(target_renamed), fill = zn_cat)) + geom_bar(position = "stack") + 
  labs(title = "Number of Neighborhoods with No Land Zoned for Large Lot Spaces",
       y = "Count",
       x = "Crime Rate",
       fill="Large Lot Spaces") + 
    theme_minimal()

library(corrplot)
## corrplot 0.95 loaded
cor_coef <- cor(crime_train_data[-c(3, 13)])

corrplot::corrplot(cor_coef, method = "number", title = "Correlations Between Continuous Predictors", 
                  order = "hclust", number.cex = 0.75, type = "lower")

DATA PREPARATION

Code categorical variables into factor variables

# chas
crime_train_data$chas <- as_factor(crime_train_data$chas)
#target
crime_train_data$target <- as_factor(crime_train_data$target)

Assess the distribution of the centered and scaled continuous predictor variables

# Scaling (centered value/sd) is performed after centering (x - mean)

scaled_data <- crime_train_data[-c(3, 13)] %>% scale(center = TRUE, scale = TRUE) %>% as_tibble()

cbind(scaled_data, crime_train_data[13]) %>% 
  mutate(target_renamed = ifelse(target == 0, "Below", "Above")) %>%
  pivot_longer(cols = columns[-length(columns)], names_to = "Feature", values_to = "Value") %>% 
  ggplot(aes(x = Value, fill = as_factor(target_renamed))) + geom_histogram() +
  facet_wrap(~ Feature, nrow = 4, ncol = 3, scales = "free") +
  labs(title = "Distribution Between Neighborhood Crime Rates",
       y = "",
       fill = "Crime Rate") +
    theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
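
For reference, scale(center = TRUE, scale = TRUE) first subtracts each column's mean and then divides by its standard deviation; a minimal sketch verifying the equivalence on the nox column:

# Manual centering and scaling should reproduce the scale() result
manual_nox <- (crime_train_data$nox - mean(crime_train_data$nox)) / sd(crime_train_data$nox)
all.equal(manual_nox, scaled_data$nox)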

  1. Split and standardize data
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.9     ✔ rsample      1.3.1
## ✔ dials        1.4.2     ✔ tailor       0.1.0
## ✔ infer        1.0.9     ✔ tune         2.0.0
## ✔ modeldata    1.5.1     ✔ workflows    1.3.0
## ✔ parsnip      1.3.3     ✔ workflowsets 1.1.1
## ✔ recipes      1.3.1     ✔ yardstick    1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
set.seed(100)

# Split data into train and test sets 
train_test_split <- initial_split(crime_train_data, prop = 0.7)

## TRAIN SET
train_df <- training(train_test_split)

# Predictors
x_train <- dplyr::select(train_df, -target)

# Center and scale predictors 

# Identify numeric columns
numeric_cols <- sapply(x_train, is.numeric)
# Apply scale() only to numeric columns
scaled_df <- as.data.frame(lapply(x_train[, numeric_cols], scale))
# Recombine with the non-numeric (factor) columns
X_train_scaled <- cbind(x_train[, !numeric_cols], scaled_df)


# Target
y_train <- train_df$target

training_data <- data.frame(target = y_train, X_train_scaled)

## TEST SET
test_df  <- testing(train_test_split)

# Predictors
x_test  <- dplyr::select(test_df,  -target)

# Center and scale predictors 
# Identify numeric columns
numeric_cols <- sapply(x_test, is.numeric)
# Apply scale() only to numeric columns
scaled_df_test <- as.data.frame(lapply(x_test[, numeric_cols], scale))
# Recombine with the non-numeric (factor) columns
X_test_scaled <- cbind(x_test[, !numeric_cols], scaled_df_test)

# Target
y_test  <- test_df$target

testing_data <- data.frame(target = y_test, X_test_scaled)
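
Note that the code above estimates means and standard deviations separately on the train and test splits. A common alternative is to learn those parameters on the training split only and apply them to the test split, for example with the recipes package (loaded with tidymodels). A sketch using the objects defined above (the *_alt names are just illustrative):

# Learn centering/scaling parameters from the training split only
norm_recipe <- recipe(target ~ ., data = train_df) %>%
  step_normalize(all_numeric_predictors()) %>%
  prep(training = train_df)

# Apply the training-derived transformation to both splits
train_scaled_alt <- bake(norm_recipe, new_data = train_df)
test_scaled_alt  <- bake(norm_recipe, new_data = test_df)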

BUILD MODELS

Base model: All variables

logreg1 <- glm(target ~ ., family=binomial, data = training_data)
summary(logreg1)
## 
## Call:
## glm(formula = target ~ ., family = binomial, data = training_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.1033     0.7543   2.788  0.00530 ** 
## chas1         1.0266     1.0516   0.976  0.32900    
## zn           -1.6844     0.9806  -1.718  0.08583 .  
## indus        -0.2092     0.3931  -0.532  0.59449    
## nox           5.6163     1.0983   5.114 3.16e-07 ***
## rm           -0.4921     0.5748  -0.856  0.39194    
## age           0.7629     0.5015   1.521  0.12821    
## dis           1.5354     0.5560   2.761  0.00575 ** 
## rad           6.6575     1.6747   3.975 7.03e-05 ***
## tax          -1.2196     0.5497  -2.219  0.02652 *  
## ptratio       0.9799     0.3226   3.038  0.00238 ** 
## lstat         0.4185     0.4305   0.972  0.33100    
## medv          1.8747     0.7477   2.507  0.01217 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 450.70  on 325  degrees of freedom
## Residual deviance: 136.83  on 313  degrees of freedom
## AIC: 162.83
## 
## Number of Fisher Scoring iterations: 9

Model Diagnostics

logreg_metrics <- function(cm){
  # cm is a confusion matrix built as table(actual, prediction),
  # with levels ordered 0 then 1: rows = actual, columns = predicted
  TN <- cm[1,1]
  TP <- cm[2,2]
  FN <- cm[2,1]
  FP <- cm[1,2]
  
  acc  <- (TP + TN) / sum(cm)              # overall accuracy
  er   <- (FP + FN) / sum(cm)              # misclassification (error) rate
  pre  <- TP / (TP + FP)                   # precision
  sens <- TP / (TP + FN)                   # sensitivity (recall)
  spec <- TN / (TN + FP)                   # specificity
  f1   <- (2 * pre * sens) / (pre + sens)  # harmonic mean of precision and recall
  
  metrics <- tibble(Metric = c("Accuracy", "Error_rate", "Precision", 
                               "Sensitivity", "Specificity", "F1_score"), 
                    Value = round(c(acc, er, pre, sens, spec, f1), 3))
  
  metrics
}
# Generate predictions using testing data
pred1_probs <- predict(logreg1, testing_data, type = 'response')
# Get the predicted labels
pred1 <- ifelse(pred1_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm1 <- table(actual = testing_data$target, prediction = pred1)
metrics1 <- logreg_metrics(cm1)
metrics1
## # A tibble: 6 × 2
##   Metric      Value
##   <chr>       <dbl>
## 1 Accuracy    0.886
## 2 Error_rate  0.114
## 3 Precision   1    
## 4 Sensitivity 0.789
## 5 Specificity 1    
## 6 F1_score    0.882
# Create the plot
ggplot(data = data.frame(cm1), aes(x = actual, y = prediction, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold") +
  # Customize colors
  scale_fill_gradient(low = "lightblue", high = "darkblue") + 
  labs(title = "Confusion Matrix", x = "Actual", y = "Predicted") +
  theme_minimal()
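
All of the metrics above are computed at a fixed 0.5 probability cutoff. For a threshold-free summary, an ROC curve and AUC can be computed from the same predicted probabilities; a sketch assuming the pROC package is installed:

library(pROC)
# ROC curve and AUC for logreg1 on the test set
roc1 <- roc(response = testing_data$target, predictor = pred1_probs)
auc(roc1)
plot(roc1, main = "ROC Curve: logreg1")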

Model 2

Improvements: Variable Selection and Variance Inflation Factor

Several of the variables are highly correlated with each other. This multicollinearity may affect the estimates of the impact each predictor has on correctly identifying whether a neighborhood's crime rate is above or below the median value. To identify variables with high multicollinearity, we can assess their variance inflation factors (VIF).
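
For intuition, the classical VIF for predictor j is 1 / (1 - R^2_j), where R^2_j comes from regressing predictor j on the remaining predictors. A hand computation for nox (car::vif, used below, works from the fitted model's coefficient covariance matrix, so its values for a GLM can differ somewhat):

# Regress nox on the other predictors, then VIF = 1 / (1 - R^2)
aux_fit <- lm(nox ~ . - target, data = training_data)
1 / (1 - summary(aux_fit)$r.squared)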

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# Write a function that takes a model, calculates VIF and plots it on a bar chart

plot_vif <- function(fitted_model, model_name){
  # Calculate VIF
  vif_values <- vif(fitted_model)
  
  # Convert VIF results to a data frame for plotting
  vif_df <- data.frame(Variable = names(vif_values), VIF = vif_values)
  
  # Set a threshold to indicate high VIF
  high_vif_threshold <- 5
  
  # Generate bar plot 
  ggplot(vif_df, aes(Variable, VIF)) + 
    geom_bar(stat = "identity", fill = "steelblue") + 
    geom_hline(yintercept = high_vif_threshold, linetype = "dashed", color = "red") + 
    scale_y_continuous(limits = c(0, max(vif_df$VIF) + 1)) + 
    labs(title = paste0("Variance Inflation Factor (VIF) for Regression Model - ", model_name), 
         y = "VIF",
         x = "Variable") + 
    theme_minimal() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
}
plot_vif(logreg1, "Log Reg 1")

Since medv and rm have the highest VIF, and rm was not significant as a predictor, let's try dropping that variable and checking the results. Before fitting another model, let's also include an interaction effect between tax and rad, since these two variables are highly correlated with each other.

crime_train_data %>% ggplot(aes(x = rad, y = tax, color = target)) + geom_point() + 
  labs(title = "Tax vs. Rad") + 
  theme_minimal()

We again see some clear outliers in this plot, which were noted in both the box plots and histograms of each variable.

logreg2 <- update(logreg1, .~. - rm + tax:rad)
summary(logreg2)
## 
## Call:
## glm(formula = target ~ chas + zn + indus + nox + age + dis + 
##     rad + tax + ptratio + lstat + medv + rad:tax, family = binomial, 
##     data = training_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.6366     0.6107   2.680  0.00737 ** 
## chas1         0.9217     1.0870   0.848  0.39646    
## zn           -2.0505     0.9886  -2.074  0.03808 *  
## indus        -0.1009     0.3979  -0.254  0.79978    
## nox           5.3795     1.0601   5.075 3.88e-07 ***
## age           0.5990     0.4487   1.335  0.18191    
## dis           1.4057     0.5330   2.637  0.00836 ** 
## rad           5.9667     1.3779   4.330 1.49e-05 ***
## tax          -2.2883     0.8486  -2.697  0.00700 ** 
## ptratio       0.8643     0.2867   3.015  0.00257 ** 
## lstat         0.5529     0.4038   1.369  0.17091    
## medv          1.4174     0.4650   3.048  0.00230 ** 
## rad:tax      -1.7275     0.9245  -1.869  0.06168 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 450.70  on 325  degrees of freedom
## Residual deviance: 136.05  on 313  degrees of freedom
## AIC: 162.05
## 
## Number of Fisher Scoring iterations: 9
# Generate predictions using testing data
pred2_probs <- predict(logreg2, testing_data, type = 'response')
# Get the predicted labels
pred2 <- ifelse(pred2_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm2 <- table(actual = testing_data$target, prediction = pred2)
# Calculate metrics
metrics2 <- logreg_metrics(cm2)
metrics2
## # A tibble: 6 × 2
##   Metric      Value
##   <chr>       <dbl>
## 1 Accuracy    0.9  
## 2 Error_rate  0.1  
## 3 Precision   1    
## 4 Sensitivity 0.816
## 5 Specificity 1    
## 6 F1_score    0.899
plot_vif(logreg2, "Log Reg 2")
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
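
The warning comes from the rad:tax interaction. As the message suggests, per-predictor GVIFs that account for the interaction can be requested with the type argument (available in recent versions of car):

# GVIFs per predictor, folding the rad:tax interaction into rad and tax
car::vif(logreg2, type = "predictor")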

Model 3

Variable Selection: Subset Technique

Use the formula from the second model, logreg2, to generate candidate subsets of predictors.

library(ISLR2)
library(leaps)

log_subset <- regsubsets(logreg2$formula, data = training_data, nvmax = 12)

subset_summary <- summary(log_subset)
subset_summary
## Subset selection object
## Call: regsubsets.formula(logreg2$formula, data = training_data, nvmax = 12)
## 12 Variables  (and intercept)
##         Forced in Forced out
## chas1       FALSE      FALSE
## zn          FALSE      FALSE
## indus       FALSE      FALSE
## nox         FALSE      FALSE
## age         FALSE      FALSE
## dis         FALSE      FALSE
## rad         FALSE      FALSE
## tax         FALSE      FALSE
## ptratio     FALSE      FALSE
## lstat       FALSE      FALSE
## medv        FALSE      FALSE
## rad:tax     FALSE      FALSE
## 1 subsets of each size up to 12
## Selection Algorithm: exhaustive
##           chas1 zn  indus nox age dis rad tax ptratio lstat medv rad:tax
## 1  ( 1 )  " "   " " " "   "*" " " " " " " " " " "     " "   " "  " "    
## 2  ( 1 )  " "   " " " "   "*" " " " " "*" " " " "     " "   " "  " "    
## 3  ( 1 )  " "   " " " "   "*" "*" " " "*" " " " "     " "   " "  " "    
## 4  ( 1 )  " "   " " " "   "*" "*" " " "*" " " " "     " "   "*"  " "    
## 5  ( 1 )  " "   " " " "   "*" "*" " " "*" " " " "     " "   "*"  "*"    
## 6  ( 1 )  " "   " " " "   "*" "*" " " "*" "*" " "     " "   "*"  "*"    
## 7  ( 1 )  " "   " " " "   "*" "*" " " "*" "*" "*"     " "   "*"  "*"    
## 8  ( 1 )  " "   " " " "   "*" "*" " " "*" "*" "*"     "*"   "*"  "*"    
## 9  ( 1 )  " "   " " "*"   "*" "*" " " "*" "*" "*"     "*"   "*"  "*"    
## 10  ( 1 ) " "   " " "*"   "*" "*" "*" "*" "*" "*"     "*"   "*"  "*"    
## 11  ( 1 ) "*"   " " "*"   "*" "*" "*" "*" "*" "*"     "*"   "*"  "*"    
## 12  ( 1 ) "*"   "*" "*"   "*" "*" "*" "*" "*" "*"     "*"   "*"  "*"
subset_summary$bic
##  [1] -237.8039 -257.6434 -269.2390 -273.6635 -273.3228 -273.0374 -270.8119
##  [8] -267.3497 -262.9127 -257.7181 -251.9647 -246.1937
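
The BIC is minimized at the four-predictor subset (-273.66). A quick way to confirm the best size and pull its terms (note that coef() here returns coefficients from the subset search's least-squares fits, not logistic coefficients):

# Subset size with the lowest BIC and the variables it selects
best_size <- which.min(subset_summary$bic)
best_size
coef(log_subset, best_size)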
logreg3 <- update(logreg2, .~. - chas - zn - indus - dis - tax - ptratio - lstat - rad:tax)
summary(logreg3)
## 
## Call:
## glm(formula = target ~ nox + age + rad + medv, family = binomial, 
##     data = training_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.7002     0.6000   2.833 0.004604 ** 
## nox           2.6793     0.5674   4.722 2.34e-06 ***
## age           0.5144     0.3559   1.445 0.148434    
## rad           3.9611     1.0939   3.621 0.000293 ***
## medv          0.4070     0.2566   1.586 0.112678    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 450.70  on 325  degrees of freedom
## Residual deviance: 169.01  on 321  degrees of freedom
## AIC: 179.01
## 
## Number of Fisher Scoring iterations: 8
# Generate predictions using testing data
pred3_probs <- predict(logreg3, testing_data, type = 'response')
# Get the predicted labels
pred3 <- ifelse(pred3_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm3 <- table(actual = testing_data$target, prediction = pred3)
# Calculate metrics
metrics3 <- logreg_metrics(cm3)
metrics3
## # A tibble: 6 × 2
##   Metric      Value
##   <chr>       <dbl>
## 1 Accuracy    0.879
## 2 Error_rate  0.121
## 3 Precision   1    
## 4 Sensitivity 0.776
## 5 Specificity 1    
## 6 F1_score    0.874

Model 4

Variable Selection: Backwards Elimination

backwards_logreg <- stats::step(logreg2, direction = "backward", k = log(nrow(training_data))) # k = log(n) gives the BIC penalty
## Start:  AIC=211.28
## target ~ chas + zn + indus + nox + age + dis + rad + tax + ptratio + 
##     lstat + medv + rad:tax
## 
##           Df Deviance    AIC
## - indus    1   136.11 205.55
## - chas     1   136.78 206.23
## - rad:tax  1   137.57 207.01
## - lstat    1   137.88 207.33
## - age      1   138.00 207.45
## <none>         136.05 211.28
## - zn       1   141.95 211.39
## - dis      1   143.75 213.19
## - ptratio  1   145.97 215.42
## - medv     1   146.91 216.35
## - nox      1   182.49 251.93
## 
## Step:  AIC=205.55
## target ~ chas + zn + nox + age + dis + rad + tax + ptratio + 
##     lstat + medv + rad:tax
## 
##           Df Deviance    AIC
## - chas     1   136.79 200.44
## - rad:tax  1   137.78 201.44
## - lstat    1   137.91 201.57
## - age      1   138.15 201.80
## <none>         136.11 205.55
## - zn       1   142.22 205.88
## - dis      1   143.80 207.46
## - ptratio  1   145.99 209.64
## - medv     1   147.00 210.65
## - nox      1   190.08 253.74
## 
## Step:  AIC=200.44
## target ~ zn + nox + age + dis + rad + tax + ptratio + lstat + 
##     medv + rad:tax
## 
##           Df Deviance    AIC
## - rad:tax  1   138.68 196.54
## - lstat    1   138.96 196.83
## - age      1   139.18 197.05
## <none>         136.79 200.44
## - zn       1   143.62 201.49
## - dis      1   144.25 202.12
## - ptratio  1   146.20 204.07
## - medv     1   148.19 206.06
## - nox      1   190.16 248.03
## 
## Step:  AIC=196.54
## target ~ zn + nox + age + dis + rad + tax + ptratio + lstat + 
##     medv
## 
##           Df Deviance    AIC
## - lstat    1   140.74 192.82
## - age      1   141.03 193.11
## - zn       1   144.45 196.53
## <none>         138.68 196.54
## - dis      1   146.42 198.50
## - ptratio  1   148.02 200.10
## - tax      1   148.40 200.49
## - medv     1   150.09 202.17
## - rad      1   180.80 232.88
## - nox      1   191.58 243.66
## 
## Step:  AIC=192.82
## target ~ zn + nox + age + dis + rad + tax + ptratio + medv
## 
##           Df Deviance    AIC
## - age      1   145.39 191.69
## - zn       1   146.01 192.31
## <none>         140.74 192.82
## - dis      1   148.77 195.07
## - ptratio  1   149.57 195.87
## - tax      1   149.80 196.10
## - medv     1   150.23 196.53
## - rad      1   182.87 229.17
## - nox      1   194.12 240.41
## 
## Step:  AIC=191.69
## target ~ zn + nox + dis + rad + tax + ptratio + medv
## 
##           Df Deviance    AIC
## - dis      1   150.45 190.96
## - zn       1   150.53 191.03
## <none>         145.39 191.69
## - medv     1   151.66 192.17
## - ptratio  1   152.55 193.06
## - tax      1   154.81 195.32
## - rad      1   186.56 227.06
## - nox      1   215.84 256.35
## 
## Step:  AIC=190.96
## target ~ zn + nox + rad + tax + ptratio + medv
## 
##           Df Deviance    AIC
## - zn       1   152.76 187.49
## - medv     1   153.88 188.60
## <none>         150.45 190.96
## - ptratio  1   157.22 191.94
## - tax      1   161.97 196.69
## - rad      1   194.91 229.63
## - nox      1   245.19 279.91
## 
## Step:  AIC=187.49
## target ~ nox + rad + tax + ptratio + medv
## 
##           Df Deviance    AIC
## - medv     1   155.64 184.58
## <none>         152.76 187.49
## - ptratio  1   160.99 189.92
## - tax      1   164.36 193.30
## - rad      1   197.13 226.07
## - nox      1   288.81 317.75
## 
## Step:  AIC=184.58
## target ~ nox + rad + tax + ptratio
## 
##           Df Deviance    AIC
## - ptratio  1   161.15 184.30
## <none>         155.64 184.58
## - tax      1   170.66 193.81
## - rad      1   206.16 229.30
## - nox      1   288.82 311.97
## 
## Step:  AIC=184.3
## target ~ nox + rad + tax
## 
##        Df Deviance    AIC
## <none>      161.15 184.30
## - tax   1   172.97 190.33
## - rad   1   206.66 224.02
## - nox   1   289.13 306.50
backwards_logreg$anova
##         Step Df   Deviance Resid. Df Resid. Dev      AIC
## 1            NA         NA       313   136.0460 211.2757
## 2    - indus  1 0.06478595       314   136.1108 205.5535
## 3     - chas  1 0.67779324       315   136.7886 200.4444
## 4  - rad:tax  1 1.88694734       316   138.6755 196.5445
## 5    - lstat  1 2.06371065       317   140.7392 192.8213
## 6      - age  1 4.65344063       318   145.3927 191.6878
## 7      - dis  1 5.05852459       319   150.4512 190.9595
## 8       - zn  1 2.31281488       320   152.7640 187.4854
## 9     - medv  1 2.87792043       321   155.6419 184.5764
## 10 - ptratio  1 5.50989614       322   161.1518 184.2994
backwards_logreg$formula
## target ~ nox + rad + tax

Fit a new regression with the resulting predictors from backwards elimination. We can update logreg2 to include only those predictors.

logreg4 <- glm(target ~ nox + rad + tax, family = "binomial", data = training_data)

summary(logreg4)
## 
## Call:
## glm(formula = target ~ nox + rad + tax, family = "binomial", 
##     data = training_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.2139     0.5861   3.777 0.000159 ***
## nox           4.1979     0.6162   6.812 9.62e-12 ***
## rad           5.5489     1.1788   4.707 2.51e-06 ***
## tax          -1.3675     0.4462  -3.064 0.002180 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 450.70  on 325  degrees of freedom
## Residual deviance: 161.15  on 322  degrees of freedom
## AIC: 169.15
## 
## Number of Fisher Scoring iterations: 8
BIC(logreg4)
## [1] 184.2994
# Generate predictions using testing data
pred4_probs <- predict(logreg4, testing_data, type = 'response')
# Get the predicted labels
pred4 <- ifelse(pred4_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm4 <- table(actual = testing_data$target, prediction = pred4)
# Calculate metrics
metrics4 <- logreg_metrics(cm4)
metrics4
## # A tibble: 6 × 2
##   Metric      Value
##   <chr>       <dbl>
## 1 Accuracy    0.871
## 2 Error_rate  0.129
## 3 Precision   1    
## 4 Sensitivity 0.763
## 5 Specificity 1    
## 6 F1_score    0.866
anova(logreg2,logreg4)
## Analysis of Deviance Table
## 
## Model 1: target ~ chas + zn + indus + nox + age + dis + rad + tax + ptratio + 
##     lstat + medv + rad:tax
## Model 2: target ~ nox + rad + tax
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1       313     136.05                        
## 2       322     161.15 -9  -25.106 0.002856 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The deviance comparison shows that the nine terms dropped from logreg2 account for a statistically significant amount of deviance (p ≈ 0.003), so the full model fits the training data better even though logreg4 is far simpler.

MODEL SELECTION

The final model is logreg2. It has the lowest residual deviance (136.05) and AIC (162.05) of the four candidates and performed best on the held-out test set (accuracy 0.900, F1 score 0.899); the simpler subset models logreg3 and logreg4 trade away a small amount of test accuracy.

# Create the plot
ggplot(data = data.frame(cm2), aes(x = actual, y = prediction, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold") +
  # Customize colors
  scale_fill_gradient(low = "red", high = "lightblue") + 
  labs(title = "Confusion Matrix: LogReg2", x = "Actual", y = "Predicted") +
  theme_minimal()

df12 <- merge(metrics1, metrics2, by = "Metric", suffixes = c("1", "2"))
df34 <- merge(metrics3,metrics4, by = "Metric", suffixes = c("3", "4")) 
summary_df <- left_join(df12, df34, by = "Metric")
colnames(summary_df) <- c("Metric", "logreg1", "logreg2", "logreg3", "logreg4")

summary_df
##        Metric logreg1 logreg2 logreg3 logreg4
## 1    Accuracy   0.886   0.900   0.879   0.871
## 2  Error_rate   0.114   0.100   0.121   0.129
## 3    F1_score   0.882   0.899   0.874   0.866
## 4   Precision   1.000   1.000   1.000   1.000
## 5 Sensitivity   0.789   0.816   0.776   0.763
## 6 Specificity   1.000   1.000   1.000   1.000
deviance <- c("Deviance", round(logreg1$deviance,2), round(logreg2$deviance,2), round(logreg3$deviance,2), round(logreg4$deviance,2))
full_summary_df <- rbind(summary_df, deviance)
full_summary_df
##        Metric logreg1 logreg2 logreg3 logreg4
## 1    Accuracy   0.886     0.9   0.879   0.871
## 2  Error_rate   0.114     0.1   0.121   0.129
## 3    F1_score   0.882   0.899   0.874   0.866
## 4   Precision       1       1       1       1
## 5 Sensitivity   0.789   0.816   0.776   0.763
## 6 Specificity       1       1       1       1
## 7    Deviance  136.83  136.05  169.01  161.15
write.csv(full_summary_df, "summary_df.csv", row.names = FALSE)
# Prepare the evaluation data: inspect it, then center and scale the predictors

head(crime_eval_data)
## # A tibble: 6 × 12
##      zn indus  chas   nox    rm   age   dis   rad   tax ptratio lstat  medv
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1     0  7.07     0 0.469  7.18  61.1  4.97     2   242    17.8  4.03  34.7
## 2     0  8.14     0 0.538  6.10  84.5  4.46     4   307    21   10.3   18.2
## 3     0  8.14     0 0.538  6.50  94.4  4.45     4   307    21   12.8   18.4
## 4     0  8.14     0 0.538  5.95  82    3.99     4   307    21   27.7   13.2
## 5     0  5.96     0 0.499  5.85  41.5  3.93     5   279    19.2  8.77  21  
## 6    25  5.13     0 0.453  5.74  66.2  7.23     8   284    19.7 13.2   18.7
# Convert the categorical variable chas to a factor

crime_eval_data$chas <- as_factor(crime_eval_data$chas)


# Center and scale the numeric predictors (column 3, chas, is excluded)
scaled_features <- scale(crime_eval_data[-3])

eval_data_prepared <- cbind(scaled_features, crime_eval_data[3])
# Keep only the predictors used by logreg2 (rm is dropped)
eval_data_prepared <- eval_data_prepared %>% dplyr::select(c(chas, zn, indus, nox, age, dis, rad, tax, ptratio, lstat, medv))


# Generate predictions for the evaluation data using the final model (logreg2)
pred_eval_probs <- predict(logreg2, eval_data_prepared, type = 'response')
# Get the predicted labels
pred_eval <- ifelse(pred_eval_probs >= 0.5, 1, 0)

crime_eval_data$pred_probs <- pred_eval_probs
crime_eval_data$pred_labels <- pred_eval

write.csv(crime_eval_data, "crime_eval_data_predictions.csv", row.names = FALSE)