# Load required packages (startup warnings and messages trimmed for readability)
library(htmltools)
library(caret)        # also loads ggplot2 and lattice
library(pROC)         # masks stats::cov, stats::smooth, stats::var
library(tidyverse)    # dplyr::filter()/lag() mask stats::filter()/lag(); purrr::lift() masks caret::lift()
library(corrplot)     # corrplot 0.95
library(skimr)
library(DataExplorer)
library(miscTools)
library(MASS)         # MASS::select masks dplyr::select, hence the explicit dplyr::select() calls below
library(performance)
library(lmtest)       # also loads zoo (masks base::as.Date, base::as.Date.numeric)
library(mice)         # mice::filter masks stats::filter; mice::cbind/rbind mask the base versions
library(glmnet)       # also loads Matrix (masks tidyr::expand, pack, unpack); glmnet 4.1-8
library(Metrics)      # masks performance::mae/mse/rmse, pROC::auc, caret::precision/recall
The dataset contains the following variables:

• zn: proportion of residential land zoned for large lots (over 25,000 square feet) (predictor variable)
• indus: proportion of non-retail business acres per suburb (predictor variable)
• chas: a dummy variable for whether the suburb borders the Charles River (1) or not (0) (predictor variable)
• nox: nitrogen oxides concentration (parts per 10 million) (predictor variable)
• rm: average number of rooms per dwelling (predictor variable)
• age: proportion of owner-occupied units built prior to 1940 (predictor variable)
• dis: weighted mean of distances to five Boston employment centers (predictor variable)
• rad: index of accessibility to radial highways (predictor variable)
• tax: full-value property-tax rate per $10,000 (predictor variable)
• ptratio: pupil-teacher ratio by town (predictor variable)
• lstat: lower status of the population (percent) (predictor variable)
• medv: median value of owner-occupied homes in $1000s (predictor variable)
• target: whether the crime rate is above the median crime rate (1) or not (0) (response variable)
crime_training_df <- read_csv("https://raw.githubusercontent.com/uzmabb182/Data_621/refs/heads/main/Assignment3/crime-training-data_modified.csv")
## Rows: 466 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, lstat, medv...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
crime_evaluation_df <- read_csv("https://raw.githubusercontent.com/uzmabb182/Data_621/refs/heads/main/Assignment3/crime-evaluation-data_modified.csv")
## Rows: 40 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, lstat, medv
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(crime_evaluation_df)
## # A tibble: 6 × 12
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 7.07 0 0.469 7.18 61.1 4.97 2 242 17.8 4.03 34.7
## 2 0 8.14 0 0.538 6.10 84.5 4.46 4 307 21 10.3 18.2
## 3 0 8.14 0 0.538 6.50 94.4 4.45 4 307 21 12.8 18.4
## 4 0 8.14 0 0.538 5.95 82 3.99 4 307 21 27.7 13.2
## 5 0 5.96 0 0.499 5.85 41.5 3.93 5 279 19.2 8.77 21
## 6 25 5.13 0 0.453 5.74 66.2 7.23 8 284 19.7 13.2 18.7
#dim(crime_training_df)
skim(crime_training_df)
Data summary

| | |
|---|---|
| Name | crime_training_df |
| Number of rows | 466 |
| Number of columns | 13 |
| Column type frequency: numeric | 13 |
| Group variables | None |

Variable type: numeric

| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| zn | 0 | 1 | 11.58 | 23.36 | 0.00 | 0.00 | 0.00 | 16.25 | 100.00 | ▇▁▁▁▁ |
| indus | 0 | 1 | 11.11 | 6.85 | 0.46 | 5.15 | 9.69 | 18.10 | 27.74 | ▇▆▁▇▁ |
| chas | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| nox | 0 | 1 | 0.55 | 0.12 | 0.39 | 0.45 | 0.54 | 0.62 | 0.87 | ▇▇▅▃▁ |
| rm | 0 | 1 | 6.29 | 0.70 | 3.86 | 5.89 | 6.21 | 6.63 | 8.78 | ▁▂▇▂▁ |
| age | 0 | 1 | 68.37 | 28.32 | 2.90 | 43.88 | 77.15 | 94.10 | 100.00 | ▂▂▂▃▇ |
| dis | 0 | 1 | 3.80 | 2.11 | 1.13 | 2.10 | 3.19 | 5.21 | 12.13 | ▇▅▂▁▁ |
| rad | 0 | 1 | 9.53 | 8.69 | 1.00 | 4.00 | 5.00 | 24.00 | 24.00 | ▇▂▁▁▃ |
| tax | 0 | 1 | 409.50 | 167.90 | 187.00 | 281.00 | 334.50 | 666.00 | 711.00 | ▇▇▅▁▇ |
| ptratio | 0 | 1 | 18.40 | 2.20 | 12.60 | 16.90 | 18.90 | 20.20 | 22.00 | ▁▃▅▅▇ |
| lstat | 0 | 1 | 12.63 | 7.10 | 1.73 | 7.04 | 11.35 | 16.93 | 37.97 | ▇▇▅▂▁ |
| medv | 0 | 1 | 22.59 | 9.24 | 5.00 | 17.02 | 21.20 | 25.00 | 50.00 | ▂▇▅▁▁ |
| target | 0 | 1 | 0.49 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
Missing values & completeness rate: the n_missing column shows no missing values (0) for any variable, so we don't need to perform imputation. The complete_rate column confirms this: every variable has a completeness rate of 1, meaning every row has a value.

Descriptive statistics:

• Mean (mean): the average value of each variable.
• Standard deviation (sd): the spread of the values.
• Minimum (p0): the lowest observed value (0th percentile).
• First quartile (p25): 25% of the values fall below this number.
• Median (p50): the middle value (50th percentile).
• Third quartile (p75): 75% of the values fall below this number.
Key observations:

• Crime rate target (target): the median (p50) is 0, indicating that more than half of the observations fall in the low-crime category (target = 0).
• Median home value (medv): mean = 22.59 and median = 21.20 (in $1000s). Three-quarters of tracts have medv at or below 25 ($25,000), with values ranging from 5 ($5,000) up to 50 ($50,000). The standard deviation (9.24) indicates a relatively high spread in home values.
• Lower-status population (lstat): mean = 12.63%, median = 11.35%. The distribution is positively skewed (p0 = 1.73, p75 = 16.93), meaning some areas have much higher lower-status populations than others.
• Property tax rate (tax): high variance (mean = 409.5, SD = 167.9). The large gap between the 25th percentile (281) and 75th percentile (666) suggests significant variability in tax rates among neighborhoods.
• Average number of rooms (rm): mean = 6.29, median = 6.21, with a relatively small spread (SD = 0.70); most homes have around 6 rooms.
• Distance to employment centers (dis): median = 3.19, with the 25th percentile at 2.10, so some neighborhoods are much closer to employment centers than others. The standard deviation (2.11) suggests some neighborhoods are far more remote.
• Industrial land proportion (indus): mean = 11.11, median = 9.69, with a right-skewed distribution (p0 = 0.46, p75 = 18.10). Some areas have much higher proportions of industrial land, potentially influencing crime.
• Highway accessibility (rad): highly right-skewed; the median is 5 but the 75th percentile is 24, so some neighborhoods have far greater highway access than others. This may be an important predictor for crime.

Potential data transformations: zn, indus, tax, rad, lstat, and medv show skewness, so a log transformation might improve normality. age, which ranges from 2.9 to 100, can be binned into categories (e.g., young, middle-aged, old). dis has a wide range, so normalization might be needed.
Before interpreting results from a binary logistic regression, we must verify three key assumptions:

• Independence of observations
• Linearity of the logit
• No multicollinearity

Next, we discuss how to check these assumptions and which correlation method best suits our data, which contains both ordinal and continuous variables.
Pearson Correlation, Spearman Rank Correlation, and Kendall’s Tau Rank Correlation are all methods used to measure the strength and direction of relationships between variables.
However, they differ in terms of their assumptions, use cases, and how they quantify relationships.
• Pearson correlation: suitable for continuous data when measuring linear associations.
• Spearman rank correlation: appropriate for both continuous and ordinal data; particularly useful when the relationship is expected to be monotonic but not necessarily linear.
• Kendall's tau rank correlation: suitable for both continuous and ordinal (ranked) data; useful when the data may not follow a linear relationship.

Since our data has both ordinal and continuous variables, Spearman's correlation is the best choice because:

• It does not assume normality (unlike Pearson).
• It can handle ordinal and binary variables (like rad and chas).
• It is more robust against outliers than Pearson.
The independence assumption in binary logistic regression states that each observation (row) in the dataset should be independent of the others. This means:
No duplicated data points (e.g., same neighborhood appearing multiple times). No clustered observations (e.g., observations grouped by region, time, or other factors). No strong correlations between residuals of observations, meaning observations do not systematically affect each other.
Spearman's correlation measures monotonic relationships between variables, making it suitable for our mix of ordinal and continuous predictors. Strong correlations among predictors can also hint at structural dependence in the data (e.g., groups of neighborhoods with similar characteristics).
We calculate the Spearman correlation matrix for all numeric variables (excluding target), which tells us whether some variables are highly dependent (correlated).
library(ggplot2)
library(tidyr)
library(dplyr)
# Exclude categorical columns before pivoting
crime_long <- crime_training_df %>%
dplyr::select(where(is.numeric)) %>% # Keep only numeric columns
pivot_longer(cols = -target, names_to = "Variable", values_to = "Value")
# Create scatter plots for each predictor vs target with a fitted line
ggplot(crime_long, aes(x = Value, y = target)) +
geom_point(alpha = 0.5, color = "blue") + # Scatter plot
geom_smooth(method = "lm", color = "red", se = FALSE) + # Linear fit
facet_wrap(~Variable, scales = "free") + # Multiple plots for each variable
theme_minimal() +
labs(title = "Independence Check: Scatter Plots of Predictors vs Target",
x = "Predictor Value",
y = "Crime Rate (Binary Target)")
## `geom_smooth()` using formula = 'y ~ x'
# Load necessary library
library(dplyr)
# Initialize an empty data frame to store results
independence_results <- data.frame(Variable = character(),
Correlation = numeric(),
P_Value = numeric(),
stringsAsFactors = FALSE)
# Loop through each predictor variable (excluding the target)
for (var in colnames(crime_training_df)[colnames(crime_training_df) != "target"]) {
# Ensure the variable is numeric before computing Spearman correlation
if (is.numeric(crime_training_df[[var]])) {
# Perform Spearman correlation test
test_result <- cor.test(crime_training_df[[var]], crime_training_df$target, method = "spearman")
# Store results in a data frame
independence_results <- rbind(independence_results,
data.frame(Variable = var,
Correlation = test_result$estimate,
P_Value = test_result$p.value))
}
}
## Warning in cor.test.default(crime_training_df[[var]], crime_training_df$target,
## : Cannot compute exact p-value with ties
## (the same warning is emitted once for each of the 12 predictors)
# View results in tabular format
print(independence_results)
## Variable Correlation P_Value
## rho zn -0.47299691 2.365331e-27
## rho1 indus 0.61915270 1.159150e-50
## rho2 chas 0.08004187 8.434811e-02
## rho3 nox 0.75471235 5.629874e-87
## rho4 rm -0.17719123 1.204401e-04
## rho5 age 0.64569520 2.544840e-56
## rho6 dis -0.65908410 2.147507e-59
## rho7 rad 0.57842556 5.796797e-43
## rho8 tax 0.59604354 3.693938e-46
## rho9 ptratio 0.35733899 1.756795e-15
## rho10 lstat 0.47946598 3.674288e-28
## rho11 medv -0.40154322 1.755130e-19
In Spearman's method, we check the correlation between each independent variable and the dependent variable (target). If the correlation is too low (|ρ| < 0.1) and the p-value > 0.05, the variable may not be useful in predicting the target.

Variables to consider removing:

• chas (ρ = 0.080, p = 0.084): no meaningful correlation with crime.
• Possibly rm (ρ = -0.177, p = 0.00012): weak correlation, but worth checking its importance in the model.

Variables to keep (for now, while monitoring multicollinearity):

• nox (ρ = 0.755)
• age (ρ = 0.646)
• dis (ρ = -0.659)
• indus (ρ = 0.619)
• rad, tax, ptratio (moderate correlation)

Transformations and interaction terms to consider:

• Log transform dis (its distribution is right-skewed).
• Bin age into "Young", "Middle-aged", and "Old".
• Add an interaction term tax * rad (both relate to crime).
# Load necessary library
library(corrplot)
library(dplyr)
crime_training_df <- crime_training_df %>%
mutate(across(where(is.character), as.numeric))
crime_evaluation_df <- crime_evaluation_df %>%
mutate(across(where(is.character), as.numeric))
# Verify again
str(crime_training_df)
## tibble [466 × 13] (S3: tbl_df/tbl/data.frame)
## $ zn : num [1:466] 0 0 0 30 0 0 0 0 0 80 ...
## $ indus : num [1:466] 19.58 19.58 18.1 4.93 2.46 ...
## $ chas : num [1:466] 0 1 0 0 0 0 0 0 0 0 ...
## $ nox : num [1:466] 0.605 0.871 0.74 0.428 0.488 0.52 0.693 0.693 0.515 0.392 ...
## $ rm : num [1:466] 7.93 5.4 6.49 6.39 7.16 ...
## $ age : num [1:466] 96.2 100 100 7.8 92.2 71.3 100 100 38.1 19.1 ...
## $ dis : num [1:466] 2.05 1.32 1.98 7.04 2.7 ...
## $ rad : num [1:466] 5 5 24 6 3 5 24 24 5 1 ...
## $ tax : num [1:466] 403 403 666 300 193 384 666 666 224 315 ...
## $ ptratio: num [1:466] 14.7 14.7 20.2 16.6 17.8 20.9 20.2 20.2 20.2 16.4 ...
## $ lstat : num [1:466] 3.7 26.82 18.85 5.19 4.82 ...
## $ medv : num [1:466] 50 13.4 15.4 23.7 37.9 26.5 5 7 22.2 20.9 ...
## $ target : num [1:466] 1 1 1 0 0 0 1 1 0 0 ...
### Visualize Correlation Matrix
# Compute Spearman correlation matrix
numeric_data <- crime_training_df %>% dplyr::select(where(is.numeric), -target)
spearman_cor <- cor(numeric_data, method = "spearman", use = "pairwise.complete.obs")
# Plot the Spearman correlation matrix with labels
corrplot(spearman_cor,
method = "color", # Color-coded correlation plot
type = "upper", # Show only upper triangle
tl.cex = 0.8, # Adjust text size of axis labels
addCoef.col = "black", # Add correlation values in black
number.cex = 0.7, # Adjust correlation label size
col = colorRampPalette(c("blue", "white", "red"))(200) # Color gradient
)
### Checking for Multicollinearity (High Correlation Between Predictors)

A general rule of thumb is that |ρ| > 0.7 indicates strong correlation between variables, which can lead to multicollinearity in the logistic regression model.

From the matrix:

• indus & nox (ρ = 0.79): strong positive correlation, meaning they provide largely redundant information.
• tax & rad (ρ = 0.70): highly correlated, indicating one of the two may be removable.
• lstat & medv (ρ = -0.85): very strong negative correlation; keeping both might be problematic.
• (If a log-transformed copy such as log_medv were later added alongside medv, their near-perfect correlation would mean one of them must be removed to prevent redundancy.)

Final takeaways: high correlation between predictors (|ρ| > 0.7) indicates potential multicollinearity, so consider removing indus, rad, or medv to improve model stability. There is no evidence of entire rows/columns being highly correlated, suggesting no major independence violations. Use VIF for confirmation and decide on variable selection accordingly.
The variance inflation factor (VIF) quantifies multicollinearity by measuring how much the variance of a regression coefficient is inflated due to correlation among predictors. A VIF above 5 (or, by the more lenient conventional cutoff, above 10) suggests severe multicollinearity.
VIF helps confirm whether the high correlations above actually affect the regression coefficients.
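For reference, the VIF of predictor j comes from an auxiliary regression of that predictor on all the others:

VIF_j = 1 / (1 − R_j²)

where R_j² is the R² obtained by regressing predictor j on the remaining predictors.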
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(glm(target ~ nox + age + rad + tax + dis + zn + medv, family = binomial, data = crime_training_df))
## nox age rad tax dis zn medv
## 2.859311 1.548306 1.421341 1.766596 3.576500 1.667181 1.784929
Conclusion: there is no severe multicollinearity (all VIF values are below 5), so there is no immediate need to drop variables based on VIF. The variable dis (VIF = 3.58) shows moderate correlation with the other predictors, but it is not problematic.
Next steps:
Keep all predictors in the model. If we suspect redundancy, re-check the pairwise correlations or test removing dis to see whether model performance improves (see the quick AIC check below).
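A minimal sketch of that check (not part of the original output): refit the same model without dis and compare AIC.

# Sketch: compare the VIF model with and without dis
m_full <- glm(target ~ nox + age + rad + tax + dis + zn + medv,
              family = binomial, data = crime_training_df)
m_nodis <- update(m_full, . ~ . - dis)  # drop dis
AIC(m_full, m_nodis)  # lower AIC indicates the better fit/complexity trade-off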
In logistic regression, we assume that each continuous predictor has a linear relationship with the log-odds (logit) of the target variable. If this assumption is violated, the model may be misleading or inaccurate.
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr)
# Fit the logistic regression model
model <- glm(target ~ nox + age + rad + tax + dis + zn + medv,
family = binomial, data = crime_training_df)
# Compute predicted probabilities
crime_training_df$predicted_prob <- predict(model, type = "response")
# Compute logit (log-odds) transformation
crime_training_df$logit <- log(crime_training_df$predicted_prob /
(1 - crime_training_df$predicted_prob))
# Reshape data for faceting
plot_data <- crime_training_df %>%
dplyr::select(logit, nox, age, rad, tax, dis, zn, medv) %>%
pivot_longer(cols = -logit, names_to = "Predictor", values_to = "Value")
# Create faceted scatter plot
ggplot(plot_data, aes(x = Value, y = logit)) +
geom_point(alpha = 0.5, color = "blue") +
geom_smooth(method = "loess", color = "red", se = FALSE) +
facet_wrap(~ Predictor, scales = "free") +
theme_minimal() +
labs(title = "Linearity of Logit Check for Binary Logistic Regression",
x = "Predictor Variable",
y = "Logit of Predicted Probability")
## `geom_smooth()` using formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at -0.5
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 13
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 156.25
### Interpretation of the Linearity of Logit Check

This faceted scatter plot assesses the linearity-of-the-logit assumption for binary logistic regression. The blue dots show the relationship between each predictor variable and the logit of the predicted probability, while the red LOESS (locally estimated scatterplot smoothing) curve helps visualize patterns.
Several predictors (e.g., dis, medv, tax, zn) violate the linearity of logit assumption.
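A more formal confirmation (a sketch, not part of the original analysis) is a Box-Tidwell-style test: add an x·log(x) term for a strictly positive predictor and check its significance; a significant term indicates the logit is non-linear in that predictor. Using dis (always positive here) as an example:

# Box-Tidwell-style check for dis: a significant I(dis * log(dis)) term
# signals non-linearity in the logit
bt_model <- glm(target ~ dis + I(dis * log(dis)),
                family = binomial, data = crime_training_df)
summary(bt_model)$coefficients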
This step involves cleaning and transforming the data to improve model performance.

Tasks:

• Handle missing values: if missing values exist, replace them with the mean/median or use imputation methods.
• Log transformations: medv, lstat, and dis might benefit from log transformations to reduce skewness.
• Binning: convert age into categorical buckets (e.g., young, middle-aged, old).
• Interactions: create new features (e.g., lstat * medv to capture the relationship between income and home values).
• Standardization: normalize numerical variables to bring them onto the same scale.
# Handle missing values (if any)
#crime_training_df$medv[is.na(crime_training_df$medv)] <- median(crime_training_df$medv, na.rm = TRUE)
# Fill missing values with median
crime_training_df$medv[is.na(crime_training_df$medv)] <- median(crime_training_df$medv, na.rm = TRUE)
crime_training_df$lstat[is.na(crime_training_df$lstat)] <- median(crime_training_df$lstat, na.rm = TRUE)
crime_training_df$dis[is.na(crime_training_df$dis)] <- median(crime_training_df$dis, na.rm = TRUE)
# Log transformation
#crime_training_df$log_medv <- log(crime_training_df$medv + 1)
crime_training_df <- crime_training_df %>%
mutate(
log_medv = log(medv + 1), # Avoid log(0)
log_lstat = log(lstat + 1),
log_dis = log(dis + 1)
)
# Standardization
#crime_training_df$zn <- scale(crime_training_df$zn)
#crime_training_df$indus <- scale(crime_training_df$indus)
#Standardization (Normalize Continuous Variables)
#Standardization ensures that variables are on the same scale, improving model stability.
crime_training_df <- crime_training_df %>%
mutate(
zn_scaled = scale(zn),
indus_scaled = scale(indus),
nox_scaled = scale(nox),
rm_scaled = scale(rm),
age_scaled = scale(age),
dis_scaled = scale(dis),
rad_scaled = scale(rad),
tax_scaled = scale(tax),
ptratio_scaled = scale(ptratio)
)
# Create categorical age-group bins
crime_training_df$age_group <- cut(crime_training_df$age, breaks=c(0, 30, 60, 100), labels=c("Young", "Middle-aged", "Old"))
# Create Interaction Terms
crime_training_df <- crime_training_df %>%
mutate(
lstat_medv_interact = log_lstat * log_medv, # Income & housing price interaction
tax_rad_interact = tax * rad # Tax burden & highway accessibility
)
colnames(crime_training_df)
## [1] "zn" "indus" "chas"
## [4] "nox" "rm" "age"
## [7] "dis" "rad" "tax"
## [10] "ptratio" "lstat" "medv"
## [13] "target" "predicted_prob" "logit"
## [16] "log_medv" "log_lstat" "log_dis"
## [19] "zn_scaled" "indus_scaled" "nox_scaled"
## [22] "rm_scaled" "age_scaled" "dis_scaled"
## [25] "rad_scaled" "tax_scaled" "ptratio_scaled"
## [28] "age_group" "lstat_medv_interact" "tax_rad_interact"
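One cosmetic note (optional): scale() returns a one-column matrix, which is why the scaled columns later print as <dbl[,1]>. A sketch of the cleanup:

# Optional: convert the one-column matrices produced by scale() into plain vectors
crime_training_df <- crime_training_df %>%
  mutate(across(ends_with("_scaled"), as.numeric))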
We fit three logistic regression models using different variable selections and transformations:

• Model 1 (baseline): all predictors.
• Model 2 (stepwise selection): stepAIC() selects important variables.
• Model 3 (transformed variables): transformed and interaction variables only.
# Load library
library(MASS)
# Logistic Regression Model 1 (Baseline)
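# Caveat: crime_training_df still carries the predicted_prob and logit
# columns created during the linearity check, plus *_scaled duplicates of
# the raw predictors, so `target ~ .` includes them all. The exact
# duplicates surface as NA ("singularity") coefficients in the summary
# below, and predicted_prob leaks an earlier model's fitted values back
# in as a predictor. A cleaner baseline would drop these columns first, e.g.
# crime_training_df %>% dplyr::select(-predicted_prob, -logit, -ends_with("_scaled"))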
model1 <- glm(target ~ ., data=crime_training_df, family=binomial)
# Logistic Regression Model 2 (Stepwise Selection)
model2 <- stepAIC(glm(target ~ ., data=crime_training_df, family=binomial), direction="both")
## Start: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## rm_scaled + age_scaled + dis_scaled + rad_scaled + tax_scaled +
## ptratio_scaled + age_group + lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## rm_scaled + age_scaled + dis_scaled + rad_scaled + tax_scaled +
## age_group + lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## rm_scaled + age_scaled + dis_scaled + rad_scaled + age_group +
## lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## rm_scaled + age_scaled + dis_scaled + age_group + lstat_medv_interact +
## tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## rm_scaled + age_scaled + age_group + lstat_medv_interact +
## tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## rm_scaled + age_group + lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## age_group + lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + age_group +
## lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + age_group + lstat_medv_interact +
## tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + age_group + lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + log_medv + log_lstat +
## log_dis + age_group + lstat_medv_interact + tax_rad_interact
##
## Df Deviance AIC
## - indus 1 173.99 213.99
## - lstat_medv_interact 1 174.01 214.01
## - log_medv 1 174.07 214.07
## - tax 1 174.13 214.13
## - log_lstat 1 174.16 214.16
## - zn 1 174.19 214.19
## - predicted_prob 1 174.22 214.22
## - chas 1 174.31 214.31
## - age_group 2 176.44 214.44
## - tax_rad_interact 1 175.07 215.07
## - medv 1 175.09 215.09
## - lstat 1 175.27 215.27
## - rm 1 175.86 215.86
## <none> 173.97 215.97
## - age 1 176.11 216.11
## - rad 1 177.98 217.98
## - dis 1 180.14 220.14
## - log_dis 1 183.16 223.16
## - ptratio 1 186.62 226.62
## - nox 1 189.02 229.02
##
## Step: AIC=213.99
## target ~ zn + chas + nox + rm + age + dis + rad + tax + ptratio +
## lstat + medv + predicted_prob + log_medv + log_lstat + log_dis +
## age_group + lstat_medv_interact + tax_rad_interact
##
## Df Deviance AIC
## - lstat_medv_interact 1 174.03 212.03
## - log_medv 1 174.08 212.08
## - tax 1 174.14 212.14
## - log_lstat 1 174.17 212.17
## - zn 1 174.22 212.22
## - predicted_prob 1 174.29 212.29
## - chas 1 174.40 212.40
## - age_group 2 176.49 212.49
## - tax_rad_interact 1 175.07 213.07
## - medv 1 175.09 213.09
## - lstat 1 175.27 213.27
## - rm 1 175.90 213.90
## <none> 173.99 213.99
## - age 1 176.11 214.11
## + indus 1 173.97 215.97
## + indus_scaled 1 173.97 215.97
## - rad 1 178.36 216.36
## - dis 1 180.95 218.95
## - log_dis 1 185.17 223.17
## - ptratio 1 186.71 224.71
## - nox 1 189.36 227.36
##
## Step: AIC=212.03
## target ~ zn + chas + nox + rm + age + dis + rad + tax + ptratio +
## lstat + medv + predicted_prob + log_medv + log_lstat + log_dis +
## age_group + tax_rad_interact
##
## Df Deviance AIC
## - tax 1 174.22 210.22
## - log_medv 1 174.24 210.24
## - zn 1 174.29 210.29
## - predicted_prob 1 174.31 210.31
## - chas 1 174.44 210.44
## - age_group 2 176.65 210.65
## - tax_rad_interact 1 175.11 211.11
## - rm 1 175.91 211.91
## - lstat 1 175.97 211.97
## <none> 174.03 212.03
## - log_lstat 1 176.10 212.10
## - age 1 176.11 212.11
## - medv 1 176.32 212.32
## + lstat_medv_interact 1 173.99 213.99
## + indus_scaled 1 174.01 214.01
## + indus 1 174.01 214.01
## - rad 1 178.43 214.43
## - dis 1 180.97 216.97
## - log_dis 1 185.24 221.24
## - ptratio 1 186.71 222.71
## - nox 1 189.74 225.74
##
## Step: AIC=210.22
## target ~ zn + chas + nox + rm + age + dis + rad + ptratio + lstat +
## medv + predicted_prob + log_medv + log_lstat + log_dis +
## age_group + tax_rad_interact
##
## Df Deviance AIC
## - log_medv 1 174.40 208.40
## - zn 1 174.44 208.44
## - chas 1 174.59 208.59
## - age_group 2 176.70 208.70
## - predicted_prob 1 174.70 208.70
## - lstat 1 176.13 210.13
## - rm 1 176.19 210.19
## <none> 174.22 210.22
## - age 1 176.24 210.24
## - log_lstat 1 176.25 210.25
## - medv 1 176.48 210.48
## - tax_rad_interact 1 176.71 210.71
## + tax 1 174.03 212.03
## + logit 1 174.03 212.03
## + tax_scaled 1 174.03 212.03
## + lstat_medv_interact 1 174.14 212.14
## + indus 1 174.22 212.22
## + indus_scaled 1 174.22 212.22
## - rad 1 179.80 213.80
## - dis 1 182.17 216.17
## - log_dis 1 186.56 220.56
## - ptratio 1 187.37 221.37
## - nox 1 189.82 223.82
##
## Step: AIC=208.4
## target ~ zn + chas + nox + rm + age + dis + rad + ptratio + lstat +
## medv + predicted_prob + log_lstat + log_dis + age_group +
## tax_rad_interact
##
## Df Deviance AIC
## - zn 1 174.62 206.62
## - age_group 2 176.72 206.72
## - chas 1 174.72 206.72
## - predicted_prob 1 174.78 206.78
## - rm 1 176.20 208.20
## <none> 174.40 208.40
## - age 1 176.55 208.55
## - tax_rad_interact 1 176.79 208.79
## - log_lstat 1 177.39 209.39
## - lstat 1 177.78 209.78
## + log_medv 1 174.22 210.22
## + tax 1 174.24 210.24
## + tax_scaled 1 174.24 210.24
## + logit 1 174.24 210.24
## + lstat_medv_interact 1 174.29 210.29
## + indus_scaled 1 174.40 210.40
## + indus 1 174.40 210.40
## - rad 1 179.93 211.93
## - medv 1 180.19 212.19
## - dis 1 182.56 214.56
## - log_dis 1 187.24 219.24
## - ptratio 1 187.90 219.90
## - nox 1 192.05 224.05
##
## Step: AIC=206.62
## target ~ chas + nox + rm + age + dis + rad + ptratio + lstat +
## medv + predicted_prob + log_lstat + log_dis + age_group +
## tax_rad_interact
##
## Df Deviance AIC
## - age_group 2 176.75 204.75
## - chas 1 174.97 204.97
## - predicted_prob 1 175.65 205.65
## <none> 174.62 206.62
## - age 1 176.66 206.66
## - rm 1 176.72 206.72
## - tax_rad_interact 1 176.81 206.81
## - log_lstat 1 177.53 207.53
## - lstat 1 177.88 207.88
## + logit 1 174.25 208.25
## + zn 1 174.40 208.40
## + zn_scaled 1 174.40 208.40
## + log_medv 1 174.44 208.44
## + tax_scaled 1 174.51 208.51
## + tax 1 174.51 208.51
## + lstat_medv_interact 1 174.52 208.52
## + indus 1 174.62 208.62
## + indus_scaled 1 174.62 208.62
## - rad 1 179.93 209.93
## - medv 1 180.34 210.34
## - dis 1 185.08 215.08
## - log_dis 1 188.78 218.78
## - ptratio 1 190.24 220.24
## - nox 1 192.08 222.08
##
## Step: AIC=204.75
## target ~ chas + nox + rm + age + dis + rad + ptratio + lstat +
## medv + predicted_prob + log_lstat + log_dis + tax_rad_interact
##
## Df Deviance AIC
## - chas 1 177.01 203.01
## - predicted_prob 1 177.90 203.90
## - rm 1 178.59 204.59
## - tax_rad_interact 1 178.62 204.62
## <none> 176.75 204.75
## - log_lstat 1 180.45 206.45
## + age_group 2 174.62 206.62
## + logit 1 176.68 206.68
## + tax 1 176.72 206.72
## + tax_scaled 1 176.72 206.72
## + zn 1 176.72 206.72
## + zn_scaled 1 176.72 206.72
## + log_medv 1 176.73 206.73
## + indus_scaled 1 176.74 206.74
## + indus 1 176.74 206.74
## + lstat_medv_interact 1 176.75 206.75
## - lstat 1 181.07 207.07
## - age 1 181.25 207.25
## - rad 1 181.66 207.66
## - medv 1 181.92 207.92
## - dis 1 186.37 212.37
## - log_dis 1 189.72 215.72
## - ptratio 1 191.70 217.70
## - nox 1 193.03 219.03
##
## Step: AIC=203.01
## target ~ nox + rm + age + dis + rad + ptratio + lstat + medv +
## predicted_prob + log_lstat + log_dis + tax_rad_interact
##
## Df Deviance AIC
## - predicted_prob 1 178.45 202.45
## - tax_rad_interact 1 178.96 202.96
## - rm 1 178.99 202.99
## <none> 177.01 203.01
## + chas 1 176.75 204.75
## - log_lstat 1 180.86 204.86
## + logit 1 176.94 204.94
## + indus 1 176.96 204.96
## + indus_scaled 1 176.96 204.96
## + age_group 2 174.97 204.97
## + zn_scaled 1 176.97 204.97
## + zn 1 176.97 204.97
## + tax_scaled 1 176.99 204.99
## + tax 1 176.99 204.99
## + log_medv 1 177.00 205.00
## + lstat_medv_interact 1 177.01 205.01
## - age 1 181.59 205.59
## - lstat 1 181.62 205.62
## - rad 1 182.09 206.09
## - medv 1 182.23 206.23
## - dis 1 187.05 211.05
## - log_dis 1 190.08 214.08
## - ptratio 1 191.70 215.70
## - nox 1 193.04 217.04
##
## Step: AIC=202.45
## target ~ nox + rm + age + dis + rad + ptratio + lstat + medv +
## log_lstat + log_dis + tax_rad_interact
##
## Df Deviance AIC
## <none> 178.45 202.45
## - rm 1 180.84 202.84
## + predicted_prob 1 177.01 203.01
## + logit 1 177.56 203.56
## + zn 1 177.73 203.73
## + zn_scaled 1 177.73 203.73
## + chas 1 177.90 203.90
## + indus 1 178.15 204.15
## + indus_scaled 1 178.15 204.15
## + tax_scaled 1 178.24 204.24
## + tax 1 178.24 204.24
## + age_group 2 176.26 204.26
## + lstat_medv_interact 1 178.36 204.36
## + log_medv 1 178.41 204.41
## - log_lstat 1 182.72 204.72
## - lstat 1 183.31 205.31
## - tax_rad_interact 1 183.89 205.89
## - medv 1 185.16 207.16
## - age 1 188.21 210.21
## - dis 1 191.26 213.26
## - ptratio 1 193.88 215.88
## - log_dis 1 198.19 220.19
## - rad 1 199.02 221.02
## - nox 1 272.24 294.24
# Logistic Regression Model 3 (With Transformations & Interactions)
model3 <- glm(target ~ log_medv + lstat + nox + ptratio + age_group, data=crime_training_df, family=binomial)
#summary(model3)
#AIC(model3) # Compare AIC
# Summary of models
summary(model1)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = crime_training_df)
##
## Coefficients: (10 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.234e+01 4.713e+01 -0.686 0.492539
## zn -1.481e-02 3.416e-02 -0.433 0.664664
## indus 8.222e-03 5.831e-02 0.141 0.887866
## chas 4.753e-01 8.114e-01 0.586 0.558009
## nox 4.628e+01 1.712e+01 2.703 0.006869 **
## rm -1.130e+00 8.284e-01 -1.363 0.172751
## age 3.708e-02 2.566e-02 1.445 0.148416
## dis -2.587e+00 1.109e+00 -2.333 0.019666 *
## rad 1.001e+00 5.071e-01 1.974 0.048395 *
## tax -1.511e-03 3.831e-03 -0.394 0.693231
## ptratio 4.609e-01 1.378e-01 3.345 0.000823 ***
## lstat 2.604e-01 2.272e-01 1.146 0.251753
## medv 2.868e-01 2.685e-01 1.068 0.285532
## predicted_prob 1.249e+00 2.387e+00 0.523 0.600865
## logit NA NA NA NA
## log_medv -4.719e+00 1.470e+01 -0.321 0.748087
## log_lstat -5.993e+00 1.358e+01 -0.441 0.659011
## log_dis 1.645e+01 5.815e+00 2.829 0.004672 **
## zn_scaled NA NA NA NA
## indus_scaled NA NA NA NA
## nox_scaled NA NA NA NA
## rm_scaled NA NA NA NA
## age_scaled NA NA NA NA
## dis_scaled NA NA NA NA
## rad_scaled NA NA NA NA
## tax_scaled NA NA NA NA
## ptratio_scaled NA NA NA NA
## age_groupMiddle-aged -1.761e+00 1.297e+00 -1.358 0.174571
## age_groupOld -1.291e+00 1.729e+00 -0.747 0.455306
## lstat_medv_interact 7.418e-01 3.595e+00 0.206 0.836521
## tax_rad_interact -9.495e-04 7.463e-04 -1.272 0.203281
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 173.97 on 445 degrees of freedom
## AIC: 215.97
##
## Number of Fisher Scoring iterations: 10
#summary(model3)
We evaluate model performance and select the best model based on multiple criteria.

Evaluation metrics:

• Accuracy: (TP + TN) / (TP + TN + FP + FN)
• Precision: TP / (TP + FP)
• Recall (sensitivity): TP / (TP + FN)
• Specificity: TN / (TN + FP)
• F1 score: 2 * (Precision * Recall) / (Precision + Recall)
• AUC-ROC curve: evaluates model discrimination.
# Load necessary library
library(caret)
library(pROC)
# Predict on training data
pred_probs <- predict(model1, type="response")
pred_classes <- ifelse(pred_probs > 0.5, 1, 0)
# Confusion Matrix
conf_matrix <- table(Predicted=pred_classes, Actual=crime_training_df$target)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 223 15
## 1 14 214
# Compute accuracy, precision, recall, F1-score
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2,2] / sum(conf_matrix[2,]) # TP / (TP + FP): row Predicted = 1
recall <- conf_matrix[2,2] / sum(conf_matrix[,2])    # TP / (TP + FN): column Actual = 1
f1_score <- 2 * (precision * recall) / (precision + recall)
f1_score
## [1] 0.9365427
# Print performance metrics
cat("Accuracy:", accuracy, "\nPrecision:", precision, "\nRecall:", recall, "\nF1 Score:", f1_score)
## Accuracy: 0.9377682
## Precision: 0.9385965
## Recall: 0.9344978
## F1 Score: 0.9365427
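As an optional cross-check (a sketch; caret is already loaded and computes the same metrics directly when given factors with the positive class set to "1"):

# Optional cross-check with caret's confusionMatrix (positive class = "1")
cm <- caret::confusionMatrix(factor(pred_classes, levels = c(0, 1)),
                             factor(crime_training_df$target, levels = c(0, 1)),
                             positive = "1")
cm$overall["Accuracy"]
cm$byClass[c("Precision", "Recall", "F1")]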
pred_prob_model1 <- predict(model1, type = "response")
pred_prob_model2 <- predict(model2, type = "response")
pred_prob_model3 <- predict(model3, type = "response")
# Convert probabilities into binary classifications (Threshold = 0.5)
pred_class_model1 <- ifelse(pred_prob_model1 > 0.5, 1, 0)
pred_class_model2 <- ifelse(pred_prob_model2 > 0.5, 1, 0)
pred_class_model3 <- ifelse(pred_prob_model3 > 0.5, 1, 0)
conf_matrix_model1 <- table(Predicted = pred_class_model1, Actual = crime_training_df$target)
conf_matrix_model2 <- table(Predicted = pred_class_model2, Actual = crime_training_df$target)
conf_matrix_model3 <- table(Predicted = pred_class_model3, Actual = crime_training_df$target)
print(conf_matrix_model1)
## Actual
## Predicted 0 1
## 0 223 15
## 1 14 214
print(conf_matrix_model2)
## Actual
## Predicted 0 1
## 0 222 19
## 1 15 210
print(conf_matrix_model3)
## Actual
## Predicted 0 1
## 0 207 39
## 1 30 190
# Define a function to compute accuracy, precision, recall, and F1-score
# (conf_matrix rows = Predicted, columns = Actual; [2,2] = true positives)
evaluate_model <- function(conf_matrix) {
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2,2] / sum(conf_matrix[2,]) # TP / (TP + FP)
recall <- conf_matrix[2,2] / sum(conf_matrix[,2])    # TP / (TP + FN)
f1_score <- 2 * ((precision * recall) / (precision + recall))
return(list(Accuracy = accuracy, Precision = precision, Recall = recall, F1_Score = f1_score))
}
eval_model1 <- evaluate_model(conf_matrix_model1)
eval_model2 <- evaluate_model(conf_matrix_model2)
eval_model3 <- evaluate_model(conf_matrix_model3)
print(eval_model1)
## $Accuracy
## [1] 0.9377682
##
## $Precision
## [1] 0.9385965
##
## $Recall
## [1] 0.9344978
##
## $F1_Score
## [1] 0.9365427
print(eval_model2)
## $Accuracy
## [1] 0.9270386
##
## $Precision
## [1] 0.9333333
##
## $Recall
## [1] 0.9170306
##
## $F1_Score
## [1] 0.9251101
print(eval_model3)
## $Accuracy
## [1] 0.8519313
##
## $Precision
## [1] 0.8636364
##
## $Recall
## [1] 0.8296943
##
## $F1_Score
## [1] 0.8463252
roc_model1 <- roc(crime_training_df$target, pred_prob_model1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_model2 <- roc(crime_training_df$target, pred_prob_model2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_model3 <- roc(crime_training_df$target, pred_prob_model3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_model1
##
## Call:
## roc.default(response = crime_training_df$target, predictor = pred_prob_model1)
##
## Data: pred_prob_model1 in 237 controls (crime_training_df$target 0) < 229 cases (crime_training_df$target 1).
## Area under the curve: 0.9782
roc_model2
##
## Call:
## roc.default(response = crime_training_df$target, predictor = pred_prob_model2)
##
## Data: pred_prob_model2 in 237 controls (crime_training_df$target 0) < 229 cases (crime_training_df$target 1).
## Area under the curve: 0.9776
roc_model3
##
## Call:
## roc.default(response = crime_training_df$target, predictor = pred_prob_model3)
##
## Data: pred_prob_model3 in 237 controls (crime_training_df$target 0) < 229 cases (crime_training_df$target 1).
## Area under the curve: 0.9472
# Plot ROC curves for comparison
plot(roc_model1, col="red", main="ROC Curves for Logistic Models")
plot(roc_model2, col="blue", add=TRUE)
plot(roc_model3, col="green", add=TRUE)
legend("bottomright", legend=c("Model 1", "Model 2", "Model 3"), col=c("red", "blue", "green"), lwd=2)
| Model | Accuracy | Precision | Recall | F1 Score | AUC |
|---|---|---|---|---|---|
| Model 1 | 0.938 | 0.939 | 0.934 | 0.937 | 0.9782 |
| Model 2 | 0.927 | 0.933 | 0.917 | 0.925 | 0.9776 |
| Model 3 | 0.852 | 0.864 | 0.830 | 0.846 | 0.9472 |
Best choice: Model 1. Model 1 is the best choice because it has:

• The highest accuracy (0.938): it makes the fewest classification errors.
• The highest F1 score (0.937): it balances precision and recall effectively.
• The highest AUC (0.9782): it is best at distinguishing between the two classes.

Model 1 is the most reliable and best at distinguishing high-crime neighborhoods. Model 2 performs nearly as well, with slightly lower accuracy (0.927) and AUC (0.9776). Model 3 performs noticeably worse, with much lower accuracy (0.852) and AUC (0.9472). Note that all three models are evaluated on the same data they were trained on, so these figures are optimistic; a holdout set or cross-validation would give a fairer comparison.
What does "fewest classification errors" mean? A classification error happens when the model predicts the wrong category:

• False negative (FN): the model predicts 0 (low crime), but the actual crime rate is high (1).
• False positive (FP): the model predicts 1 (high crime), but the actual crime rate is low (0).

Higher accuracy means fewer FPs and FNs, i.e., more correct predictions.
Accuracy measures the overall correctness of the classification model: the proportion of correct predictions (both positive and negative) out of all predictions.

Accuracy = (TP + TN) / (TP + TN + FP + FN)
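Plugging in Model 1's confusion matrix (TN = 223, TP = 214, FP = 14, FN = 15):

Accuracy = (214 + 223) / 466 = 437 / 466 ≈ 0.9377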
False negatives are dangerous (crime exists but the area is predicted "safe"). Consequence: a high-crime area is misclassified as low-crime, so authorities don't take the necessary action:

• Police resources may not be allocated where crime is actually high.
• Residents feel falsely safe while facing real crime threats.
• Crime could increase due to lack of intervention.

False positives cause unnecessary fear and resource waste (a safe area is misclassified as high-crime). Consequence: a low-crime area is misclassified as high-crime, so unnecessary actions are taken:

• Increased policing in safe areas wastes law enforcement resources.
• Real estate values may drop due to the incorrect classification.
• Residents may feel unsafe even though crime is low.
Bottom line: the highest accuracy (0.938) makes Model 1 the best model for crime prevention.
An F1 score of 0.937 means the model balances precision and recall very well:

F1 = 2 × (Precision × Recall) / (Precision + Recall)
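Equivalently, F1 = 2·TP / (2·TP + FP + FN); for Model 1 this is 2 × 214 / (428 + 14 + 15) = 428 / 457 ≈ 0.9365, matching the reported value.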
• Precision ("when the model says high crime, is it actually high crime?"): high precision means few false positives, so the model doesn't falsely label safe areas as high-crime.
• Recall ("did the model find all the high-crime areas?"): high recall means few false negatives, so the model doesn't miss actual high-crime areas.

A high F1 score (0.937) means few false positives (safe areas aren't wrongly classified as high-crime), few false negatives (actual high-crime areas are correctly identified), and a well-balanced, reliable model.
The ROC (receiver operating characteristic) curve is a graph that shows how well a model can separate two classes (here, high-crime vs. low-crime areas). The AUC (area under the curve) summarizes in a single number how well the model separates high-crime (1) from low-crime (0) areas.

A high AUC (0.9782) means:

• The model correctly distinguishes between crime-prone and safe areas most of the time.
• It rarely confuses high-crime neighborhoods with low-crime ones.
• It performs far better than random guessing (AUC = 0.5).

In simple terms: with AUC = 0.9782, the model is very effective at separating high-crime and low-crime areas.
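Whether Model 1's small AUC edge over Model 2 is statistically meaningful could be checked with DeLong's test for paired ROC curves (a sketch using the roc objects created above):

# Sketch: DeLong's test for a difference between two correlated ROC curves
roc.test(roc_model1, roc_model2, method = "delong")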
Once the best model is selected, we use it to generate predictions for crime_evaluation_df. Note, however, that Model 1 cannot score the evaluation set, because it was fit with the predicted_prob column that exists only in the training data; the predictions below therefore use Model 2, whose performance is nearly identical.
# Handle missing values (if any)
#crime_training_df$medv[is.na(crime_training_df$medv)] <- median(crime_training_df$medv, na.rm = TRUE)
# Fill missing values with median
crime_evaluation_df$medv[is.na(crime_evaluation_df$medv)] <- median(crime_evaluation_df$medv, na.rm = TRUE)
crime_evaluation_df$lstat[is.na(crime_evaluation_df$lstat)] <- median(crime_evaluation_df$lstat, na.rm = TRUE)
crime_evaluation_df$dis[is.na(crime_evaluation_df$dis)] <- median(crime_evaluation_df$dis, na.rm = TRUE)
# Log transformation
#crime_training_df$log_medv <- log(crime_training_df$medv + 1)
crime_evaluation_df <- crime_evaluation_df %>%
mutate(
log_medv = log(medv + 1), # Avoid log(0)
log_lstat = log(lstat + 1),
log_dis = log(dis + 1)
)
# Standardization
#crime_training_df$zn <- scale(crime_training_df$zn)
#crime_training_df$indus <- scale(crime_training_df$indus)
#Standardization (Normalize Continuous Variables)
#Standardization ensures that variables are on the same scale, improving model stability.
crime_evaluation_df <- crime_evaluation_df %>%
mutate(
zn_scaled = scale(zn),
indus_scaled = scale(indus),
nox_scaled = scale(nox),
rm_scaled = scale(rm),
age_scaled = scale(age),
dis_scaled = scale(dis),
rad_scaled = scale(rad),
tax_scaled = scale(tax),
ptratio_scaled = scale(ptratio)
)
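# Note (caveat): scale() here standardizes with the evaluation set's own
# means and SDs. If a model actually used the *_scaled columns, the
# centers and scales saved from the training data should be reused
# instead, e.g. scale(x, center = train_mean, scale = train_sd) with
# train_mean/train_sd computed from crime_training_df. Model 2 retains
# none of the *_scaled columns, so this does not affect the predictions below.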
# Create categorical age-group bins
crime_evaluation_df$age_group <- cut(crime_evaluation_df$age, breaks=c(0, 30, 60, 100), labels=c("Young", "Middle-aged", "Old"))
# Create Interaction Terms
crime_evaluation_df <- crime_evaluation_df %>%
mutate(
lstat_medv_interact = log_lstat * log_medv, # Income & housing price interaction
tax_rad_interact = tax * rad # Tax burden & highway accessibility
)
colnames(crime_evaluation_df)
## [1] "zn" "indus" "chas"
## [4] "nox" "rm" "age"
## [7] "dis" "rad" "tax"
## [10] "ptratio" "lstat" "medv"
## [13] "log_medv" "log_lstat" "log_dis"
## [16] "zn_scaled" "indus_scaled" "nox_scaled"
## [19] "rm_scaled" "age_scaled" "dis_scaled"
## [22] "rad_scaled" "tax_scaled" "ptratio_scaled"
## [25] "age_group" "lstat_medv_interact" "tax_rad_interact"
#colnames(crime_training_df)
# Predict probabilities using model2 (model1 cannot score the evaluation
# set: it depends on the predicted_prob/logit columns that exist only in
# the training data)
eval_pred_prob <- predict(model2, newdata = crime_evaluation_df, type = "response")
# Convert probabilities to binary class (0 or 1) using a threshold of 0.5
eval_pred_class <- ifelse(eval_pred_prob > 0.5, 1, 0)
# Add predictions to the evaluation data
crime_evaluation_df$predicted_prob <- eval_pred_prob
crime_evaluation_df$predicted_class <- eval_pred_class
file_path <- "C:/Users/Uzma/Downloads/crime_evaluation_df.csv"
# Write dataframe to CSV
write.csv(crime_evaluation_df, file = file_path, row.names = FALSE)
crime_evaluation_df
## # A tibble: 40 × 29
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 7.07 0 0.469 7.18 61.1 4.97 2 242 17.8 4.03 34.7
## 2 0 8.14 0 0.538 6.10 84.5 4.46 4 307 21 10.3 18.2
## 3 0 8.14 0 0.538 6.50 94.4 4.45 4 307 21 12.8 18.4
## 4 0 8.14 0 0.538 5.95 82 3.99 4 307 21 27.7 13.2
## 5 0 5.96 0 0.499 5.85 41.5 3.93 5 279 19.2 8.77 21
## 6 25 5.13 0 0.453 5.74 66.2 7.23 8 284 19.7 13.2 18.7
## 7 25 5.13 0 0.453 5.97 93.4 6.82 8 284 19.7 14.4 16
## 8 0 4.49 0 0.449 6.63 56.1 4.44 3 247 18.5 6.53 26.6
## 9 0 4.49 0 0.449 6.12 56.8 3.75 3 247 18.5 8.44 22.2
## 10 0 2.89 0 0.445 6.16 69.6 3.50 2 276 18 11.3 21.4
## # ℹ 30 more rows
## # ℹ 17 more variables: log_medv <dbl>, log_lstat <dbl>, log_dis <dbl>,
## # zn_scaled <dbl[,1]>, indus_scaled <dbl[,1]>, nox_scaled <dbl[,1]>,
## # rm_scaled <dbl[,1]>, age_scaled <dbl[,1]>, dis_scaled <dbl[,1]>,
## # rad_scaled <dbl[,1]>, tax_scaled <dbl[,1]>, ptratio_scaled <dbl[,1]>,
## # age_group <fct>, lstat_medv_interact <dbl>, tax_rad_interact <dbl>,
## # predicted_prob <dbl>, predicted_class <dbl>