# Load required packages (startup warnings and messages trimmed for readability)
library(htmltools)
library(caret)        # also loads ggplot2 and lattice
library(pROC)         # masks stats::cov, stats::smooth, stats::var
library(tidyverse)    # dplyr::filter()/lag() mask stats::filter()/lag(); purrr::lift() masks caret::lift()
library(corrplot)     # corrplot 0.95
library(skimr)
library(DataExplorer)
library(miscTools)
library(MASS)         # MASS::select masks dplyr::select, hence the explicit dplyr::select() calls below
library(performance)
library(lmtest)       # also loads zoo (masks base::as.Date, base::as.Date.numeric)
library(mice)         # mice::filter masks stats::filter; mice::cbind/rbind mask the base versions
library(glmnet)       # also loads Matrix (masks tidyr::expand, pack, unpack); glmnet 4.1-8
library(Metrics)      # masks performance::mae/mse/rmse, pROC::auc, caret::precision/recall
The dataset contains the following variables:

• zn: proportion of residential land zoned for large lots (over 25,000 square feet) (predictor variable)
• indus: proportion of non-retail business acres per suburb (predictor variable)
• chas: a dummy variable for whether the suburb borders the Charles River (1) or not (0) (predictor variable)
• nox: nitrogen oxides concentration (parts per 10 million) (predictor variable)
• rm: average number of rooms per dwelling (predictor variable)
• age: proportion of owner-occupied units built prior to 1940 (predictor variable)
• dis: weighted mean of distances to five Boston employment centers (predictor variable)
• rad: index of accessibility to radial highways (predictor variable)
• tax: full-value property-tax rate per $10,000 (predictor variable)
• ptratio: pupil-teacher ratio by town (predictor variable)
• lstat: lower status of the population (percent) (predictor variable)
• medv: median value of owner-occupied homes in $1000s (predictor variable)
• target: whether the crime rate is above the median crime rate (1) or not (0) (response variable)
crime_training_df <- read_csv("https://raw.githubusercontent.com/uzmabb182/Data_621/refs/heads/main/Assignment3/crime-training-data_modified.csv")
## Rows: 466 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, lstat, medv...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
crime_evaluation_df <- read_csv("https://raw.githubusercontent.com/uzmabb182/Data_621/refs/heads/main/Assignment3/crime-evaluation-data_modified.csv")
## Rows: 40 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, lstat, medv
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(crime_evaluation_df)
## # A tibble: 6 × 12
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 7.07 0 0.469 7.18 61.1 4.97 2 242 17.8 4.03 34.7
## 2 0 8.14 0 0.538 6.10 84.5 4.46 4 307 21 10.3 18.2
## 3 0 8.14 0 0.538 6.50 94.4 4.45 4 307 21 12.8 18.4
## 4 0 8.14 0 0.538 5.95 82 3.99 4 307 21 27.7 13.2
## 5 0 5.96 0 0.499 5.85 41.5 3.93 5 279 19.2 8.77 21
## 6 25 5.13 0 0.453 5.74 66.2 7.23 8 284 19.7 13.2 18.7
#dim(crime_training_df)
skim(crime_training_df)
Data summary

| | |
|---|---|
| Name | crime_training_df |
| Number of rows | 466 |
| Number of columns | 13 |
| Column type frequency: numeric | 13 |
| Group variables | None |

Variable type: numeric

| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| zn | 0 | 1 | 11.58 | 23.36 | 0.00 | 0.00 | 0.00 | 16.25 | 100.00 | ▇▁▁▁▁ |
| indus | 0 | 1 | 11.11 | 6.85 | 0.46 | 5.15 | 9.69 | 18.10 | 27.74 | ▇▆▁▇▁ |
| chas | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| nox | 0 | 1 | 0.55 | 0.12 | 0.39 | 0.45 | 0.54 | 0.62 | 0.87 | ▇▇▅▃▁ |
| rm | 0 | 1 | 6.29 | 0.70 | 3.86 | 5.89 | 6.21 | 6.63 | 8.78 | ▁▂▇▂▁ |
| age | 0 | 1 | 68.37 | 28.32 | 2.90 | 43.88 | 77.15 | 94.10 | 100.00 | ▂▂▂▃▇ |
| dis | 0 | 1 | 3.80 | 2.11 | 1.13 | 2.10 | 3.19 | 5.21 | 12.13 | ▇▅▂▁▁ |
| rad | 0 | 1 | 9.53 | 8.69 | 1.00 | 4.00 | 5.00 | 24.00 | 24.00 | ▇▂▁▁▃ |
| tax | 0 | 1 | 409.50 | 167.90 | 187.00 | 281.00 | 334.50 | 666.00 | 711.00 | ▇▇▅▁▇ |
| ptratio | 0 | 1 | 18.40 | 2.20 | 12.60 | 16.90 | 18.90 | 20.20 | 22.00 | ▁▃▅▅▇ |
| lstat | 0 | 1 | 12.63 | 7.10 | 1.73 | 7.04 | 11.35 | 16.93 | 37.97 | ▇▇▅▂▁ |
| medv | 0 | 1 | 22.59 | 9.24 | 5.00 | 17.02 | 21.20 | 25.00 | 50.00 | ▂▇▅▁▁ |
| target | 0 | 1 | 0.49 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
Missing values & completeness rate: the n_missing column shows no missing values (0) for any variable, so we don't need to perform imputation. The complete_rate column confirms this: every variable has a completeness rate of 1, meaning every row has a value.

Descriptive statistics:

• Mean (mean): the average value of each variable.
• Standard deviation (sd): the spread of the values.
• Minimum (p0): the lowest observed value (0th percentile).
• First quartile (p25): 25% of the values fall below this number.
• Median (p50): the middle value (50th percentile).
• Third quartile (p75): 75% of the values fall below this number.
Key observations:

• Crime rate target (target): the median (p50) is 0, indicating that more than half of the observations fall in the low-crime category (target = 0).
• Median home value (medv): mean = 22.59 and median = 21.20 (in $1000s). Three-quarters of tracts have medv at or below 25 ($25,000), with values ranging from 5 ($5,000) up to 50 ($50,000). The standard deviation (9.24) indicates a relatively high spread in home values.
• Lower-status population (lstat): mean = 12.63%, median = 11.35%. The distribution is positively skewed (p0 = 1.73, p75 = 16.93), meaning some areas have much higher lower-status populations than others.
• Property tax rate (tax): high variance (mean = 409.5, SD = 167.9). The large gap between the 25th percentile (281) and 75th percentile (666) suggests significant variability in tax rates among neighborhoods.
• Average number of rooms (rm): mean = 6.29, median = 6.21, with a relatively small spread (SD = 0.70); most homes have around 6 rooms.
• Distance to employment centers (dis): median = 3.19, with the 25th percentile at 2.10, so some neighborhoods are much closer to employment centers than others. The standard deviation (2.11) suggests some neighborhoods are far more remote.
• Industrial land proportion (indus): mean = 11.11, median = 9.69, with a right-skewed distribution (p0 = 0.46, p75 = 18.10). Some areas have much higher proportions of industrial land, potentially influencing crime.
• Highway accessibility (rad): highly right-skewed; the median is 5 but the 75th percentile is 24, so some neighborhoods have far greater highway access than others. This may be an important predictor for crime.

Potential data transformations: zn, indus, tax, rad, lstat, and medv show skewness, so a log transformation might improve normality. age, which ranges from 2.9 to 100, can be binned into categories (e.g., young, middle-aged, old). dis has a wide range, so normalization might be needed.
Before interpreting results from a binary logistic regression, we must verify three key assumptions:

• Independence of observations
• Linearity of the logit
• No multicollinearity

Next, we discuss how to check these assumptions and which correlation method best suits our data, which contains both ordinal and continuous variables.
Pearson Correlation, Spearman Rank Correlation, and Kendall’s Tau Rank Correlation are all methods used to measure the strength and direction of relationships between variables.
However, they differ in terms of their assumptions, use cases, and how they quantify relationships.
• Pearson correlation: suitable for continuous data when measuring linear associations.
• Spearman rank correlation: appropriate for both continuous and ordinal data; particularly useful when the relationship is expected to be monotonic but not necessarily linear.
• Kendall's tau rank correlation: suitable for both continuous and ordinal (ranked) data; useful when the data may not follow a linear relationship.

Since our data has both ordinal and continuous variables, Spearman's correlation is the best choice because:

• It does not assume normality (unlike Pearson).
• It can handle ordinal and binary variables (like rad and chas).
• It is more robust against outliers than Pearson.
The independence assumption in binary logistic regression states that each observation (row) in the dataset should be independent of the others. This means:
No duplicated data points (e.g., same neighborhood appearing multiple times). No clustered observations (e.g., observations grouped by region, time, or other factors). No strong correlations between residuals of observations, meaning observations do not systematically affect each other.
Spearman's correlation measures monotonic relationships between variables, making it suitable for our mix of ordinal and continuous predictors. Strong correlations among predictors can also hint at structural dependence in the data (e.g., groups of neighborhoods with similar characteristics).
We calculate the Spearman correlation matrix for all numeric variables (excluding target), which tells us whether some variables are highly dependent (correlated).
library(ggplot2)
library(tidyr)
library(dplyr)
# Exclude categorical columns before pivoting
crime_long <- crime_training_df %>%
dplyr::select(where(is.numeric)) %>% # Keep only numeric columns
pivot_longer(cols = -target, names_to = "Variable", values_to = "Value")
# Create scatter plots for each predictor vs target with a fitted line
ggplot(crime_long, aes(x = Value, y = target)) +
geom_point(alpha = 0.5, color = "blue") + # Scatter plot
geom_smooth(method = "lm", color = "red", se = FALSE) + # Linear fit
facet_wrap(~Variable, scales = "free") + # Multiple plots for each variable
theme_minimal() +
labs(title = "Independence Check: Scatter Plots of Predictors vs Target",
x = "Predictor Value",
y = "Crime Rate (Binary Target)")
## `geom_smooth()` using formula = 'y ~ x'
# Load necessary library
library(dplyr)
# Initialize an empty data frame to store results
independence_results <- data.frame(Variable = character(),
Correlation = numeric(),
P_Value = numeric(),
stringsAsFactors = FALSE)
# Loop through each predictor variable (excluding the target)
for (var in colnames(crime_training_df)[colnames(crime_training_df) != "target"]) {
# Ensure the variable is numeric before computing Spearman correlation
if (is.numeric(crime_training_df[[var]])) {
# Perform Spearman correlation test
test_result <- cor.test(crime_training_df[[var]], crime_training_df$target, method = "spearman")
# Store results in a data frame
independence_results <- rbind(independence_results,
data.frame(Variable = var,
Correlation = test_result$estimate,
P_Value = test_result$p.value))
}
}
## Warning in cor.test.default(crime_training_df[[var]], crime_training_df$target,
## : Cannot compute exact p-value with ties
## (the same warning is emitted once for each of the 12 predictors)
# View results in tabular format
print(independence_results)
## Variable Correlation P_Value
## rho zn -0.47299691 2.365331e-27
## rho1 indus 0.61915270 1.159150e-50
## rho2 chas 0.08004187 8.434811e-02
## rho3 nox 0.75471235 5.629874e-87
## rho4 rm -0.17719123 1.204401e-04
## rho5 age 0.64569520 2.544840e-56
## rho6 dis -0.65908410 2.147507e-59
## rho7 rad 0.57842556 5.796797e-43
## rho8 tax 0.59604354 3.693938e-46
## rho9 ptratio 0.35733899 1.756795e-15
## rho10 lstat 0.47946598 3.674288e-28
## rho11 medv -0.40154322 1.755130e-19
In Spearman's method, we check the correlation between each independent variable and the dependent variable (target). If the correlation is too low (|ρ| < 0.1) and the p-value > 0.05, the variable may not be useful in predicting the target.

Variables to consider removing:

• chas (ρ = 0.080, p = 0.084): no meaningful correlation with crime.
• Possibly rm (ρ = -0.177, p = 0.00012): weak correlation, but worth checking its importance in the model.

Variables to keep (for now, while monitoring multicollinearity):

• nox (ρ = 0.755)
• age (ρ = 0.646)
• dis (ρ = -0.659)
• indus (ρ = 0.619)
• rad, tax, ptratio (moderate correlation)

Transformations and interaction terms to consider:

• Log transform dis (its distribution is right-skewed).
• Bin age into "Young", "Middle-aged", and "Old".
• Add an interaction term tax * rad (both relate to crime).
# Load necessary library
library(corrplot)
library(dplyr)
crime_training_df <- crime_training_df %>%
mutate(across(where(is.character), as.numeric))
crime_evaluation_df <- crime_evaluation_df %>%
mutate(across(where(is.character), as.numeric))
# Verify again
str(crime_training_df)
## tibble [466 × 13] (S3: tbl_df/tbl/data.frame)
## $ zn : num [1:466] 0 0 0 30 0 0 0 0 0 80 ...
## $ indus : num [1:466] 19.58 19.58 18.1 4.93 2.46 ...
## $ chas : num [1:466] 0 1 0 0 0 0 0 0 0 0 ...
## $ nox : num [1:466] 0.605 0.871 0.74 0.428 0.488 0.52 0.693 0.693 0.515 0.392 ...
## $ rm : num [1:466] 7.93 5.4 6.49 6.39 7.16 ...
## $ age : num [1:466] 96.2 100 100 7.8 92.2 71.3 100 100 38.1 19.1 ...
## $ dis : num [1:466] 2.05 1.32 1.98 7.04 2.7 ...
## $ rad : num [1:466] 5 5 24 6 3 5 24 24 5 1 ...
## $ tax : num [1:466] 403 403 666 300 193 384 666 666 224 315 ...
## $ ptratio: num [1:466] 14.7 14.7 20.2 16.6 17.8 20.9 20.2 20.2 20.2 16.4 ...
## $ lstat : num [1:466] 3.7 26.82 18.85 5.19 4.82 ...
## $ medv : num [1:466] 50 13.4 15.4 23.7 37.9 26.5 5 7 22.2 20.9 ...
## $ target : num [1:466] 1 1 1 0 0 0 1 1 0 0 ...
### Visualize Correlation Matrix
# Compute Spearman correlation matrix
numeric_data <- crime_training_df %>% dplyr::select(where(is.numeric), -target)
spearman_cor <- cor(numeric_data, method = "spearman", use = "pairwise.complete.obs")
# Plot the Spearman correlation matrix with labels
corrplot(spearman_cor,
method = "color", # Color-coded correlation plot
type = "upper", # Show only upper triangle
tl.cex = 0.8, # Adjust text size of axis labels
addCoef.col = "black", # Add correlation values in black
number.cex = 0.7, # Adjust correlation label size
col = colorRampPalette(c("blue", "white", "red"))(200) # Color gradient
)
### Checking for Multicollinearity (High Correlation Between Predictors)

A general rule of thumb is that |ρ| > 0.7 indicates strong correlation between variables, which can lead to multicollinearity in the logistic regression model.

From the matrix:

• indus & nox (ρ = 0.79): strong positive correlation, meaning they provide largely redundant information.
• tax & rad (ρ = 0.70): highly correlated, indicating one of the two may be removable.
• lstat & medv (ρ = -0.85): very strong negative correlation; keeping both might be problematic.
• (If a log-transformed copy such as log_medv were later added alongside medv, their near-perfect correlation would mean one of them must be removed to prevent redundancy.)

Final takeaways: high correlation between predictors (|ρ| > 0.7) indicates potential multicollinearity, so consider removing indus, rad, or medv to improve model stability. There is no evidence of entire rows/columns being highly correlated, suggesting no major independence violations. Use VIF for confirmation and decide on variable selection accordingly.
The variance inflation factor (VIF) quantifies multicollinearity by measuring how much the variance of a regression coefficient is inflated due to correlation among predictors. A VIF above 5 (or, by the more lenient conventional cutoff, above 10) suggests severe multicollinearity.
VIF helps confirm whether the high correlations above actually affect the regression coefficients.
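For reference, the VIF of predictor j comes from an auxiliary regression of that predictor on all the others:

VIF_j = 1 / (1 − R_j²)

where R_j² is the R² obtained by regressing predictor j on the remaining predictors.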
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(glm(target ~ nox + age + rad + tax + dis + zn + medv, family = binomial, data = crime_training_df))
## nox age rad tax dis zn medv
## 2.859311 1.548306 1.421341 1.766596 3.576500 1.667181 1.784929
Conclusion: there is no severe multicollinearity (all VIF values are below 5), so there is no immediate need to drop variables based on VIF. The variable dis (VIF = 3.58) shows moderate correlation with the other predictors, but it is not problematic.
Next steps:
Keep all predictors in the model. If we suspect redundancy, re-check the pairwise correlations or test removing dis to see whether model performance improves (see the quick AIC check below).
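A minimal sketch of that check (not part of the original output): refit the same model without dis and compare AIC.

# Sketch: compare the VIF model with and without dis
m_full <- glm(target ~ nox + age + rad + tax + dis + zn + medv,
              family = binomial, data = crime_training_df)
m_nodis <- update(m_full, . ~ . - dis)  # drop dis
AIC(m_full, m_nodis)  # lower AIC indicates the better fit/complexity trade-off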
In logistic regression, we assume that each continuous predictor has a linear relationship with the log-odds (logit) of the target variable. If this assumption is violated, the model may be misleading or inaccurate.
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr)
# Fit the logistic regression model
model <- glm(target ~ nox + age + rad + tax + dis + zn + medv,
family = binomial, data = crime_training_df)
# Compute predicted probabilities
crime_training_df$predicted_prob <- predict(model, type = "response")
# Compute logit (log-odds) transformation
crime_training_df$logit <- log(crime_training_df$predicted_prob /
(1 - crime_training_df$predicted_prob))
# Reshape data for faceting
plot_data <- crime_training_df %>%
dplyr::select(logit, nox, age, rad, tax, dis, zn, medv) %>%
pivot_longer(cols = -logit, names_to = "Predictor", values_to = "Value")
# Create faceted scatter plot
ggplot(plot_data, aes(x = Value, y = logit)) +
geom_point(alpha = 0.5, color = "blue") +
geom_smooth(method = "loess", color = "red", se = FALSE) +
facet_wrap(~ Predictor, scales = "free") +
theme_minimal() +
labs(title = "Linearity of Logit Check for Binary Logistic Regression",
x = "Predictor Variable",
y = "Logit of Predicted Probability")
## `geom_smooth()` using formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at -0.5
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 13
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 156.25
### Interpretation of the Linearity of Logit Check

This faceted scatter plot assesses the linearity-of-the-logit assumption for binary logistic regression. The blue dots show the relationship between each predictor variable and the logit of the predicted probability, while the red LOESS (locally estimated scatterplot smoothing) curve helps visualize patterns.
Several predictors (e.g., dis, medv, tax, zn) violate the linearity of logit assumption.
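A more formal confirmation (a sketch, not part of the original analysis) is a Box-Tidwell-style test: add an x·log(x) term for a strictly positive predictor and check its significance; a significant term indicates the logit is non-linear in that predictor. Using dis (always positive here) as an example:

# Box-Tidwell-style check for dis: a significant I(dis * log(dis)) term
# signals non-linearity in the logit
bt_model <- glm(target ~ dis + I(dis * log(dis)),
                family = binomial, data = crime_training_df)
summary(bt_model)$coefficients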
This step involves cleaning and transforming the data to improve model performance.

Tasks:

• Handle missing values: if missing values exist, replace them with the mean/median or use imputation methods.
• Log transformations: medv, lstat, and dis might benefit from log transformations to reduce skewness.
• Binning: convert age into categorical buckets (e.g., young, middle-aged, old).
• Interactions: create new features (e.g., lstat * medv to capture the relationship between income and home values).
• Standardization: normalize numerical variables to bring them onto the same scale.
# Handle missing values (if any)
#crime_training_df$medv[is.na(crime_training_df$medv)] <- median(crime_training_df$medv, na.rm = TRUE)
# Fill missing values with median
crime_training_df$medv[is.na(crime_training_df$medv)] <- median(crime_training_df$medv, na.rm = TRUE)
crime_training_df$lstat[is.na(crime_training_df$lstat)] <- median(crime_training_df$lstat, na.rm = TRUE)
crime_training_df$dis[is.na(crime_training_df$dis)] <- median(crime_training_df$dis, na.rm = TRUE)
# Log transformation
#crime_training_df$log_medv <- log(crime_training_df$medv + 1)
crime_training_df <- crime_training_df %>%
mutate(
log_medv = log(medv + 1), # Avoid log(0)
log_lstat = log(lstat + 1),
log_dis = log(dis + 1)
)
# Standardization
#crime_training_df$zn <- scale(crime_training_df$zn)
#crime_training_df$indus <- scale(crime_training_df$indus)
#Standardization (Normalize Continuous Variables)
#Standardization ensures that variables are on the same scale, improving model stability.
crime_training_df <- crime_training_df %>%
mutate(
zn_scaled = scale(zn),
indus_scaled = scale(indus),
nox_scaled = scale(nox),
rm_scaled = scale(rm),
age_scaled = scale(age),
dis_scaled = scale(dis),
rad_scaled = scale(rad),
tax_scaled = scale(tax),
ptratio_scaled = scale(ptratio)
)
# Create categorical age-group bins
crime_training_df$age_group <- cut(crime_training_df$age, breaks=c(0, 30, 60, 100), labels=c("Young", "Middle-aged", "Old"))
# Create Interaction Terms
crime_training_df <- crime_training_df %>%
mutate(
lstat_medv_interact = log_lstat * log_medv, # Income & housing price interaction
tax_rad_interact = tax * rad # Tax burden & highway accessibility
)
colnames(crime_training_df)
## [1] "zn" "indus" "chas"
## [4] "nox" "rm" "age"
## [7] "dis" "rad" "tax"
## [10] "ptratio" "lstat" "medv"
## [13] "target" "predicted_prob" "logit"
## [16] "log_medv" "log_lstat" "log_dis"
## [19] "zn_scaled" "indus_scaled" "nox_scaled"
## [22] "rm_scaled" "age_scaled" "dis_scaled"
## [25] "rad_scaled" "tax_scaled" "ptratio_scaled"
## [28] "age_group" "lstat_medv_interact" "tax_rad_interact"
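One cosmetic note (optional): scale() returns a one-column matrix, which is why the scaled columns later print as <dbl[,1]>. A sketch of the cleanup:

# Optional: convert the one-column matrices produced by scale() into plain vectors
crime_training_df <- crime_training_df %>%
  mutate(across(ends_with("_scaled"), as.numeric))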
We fit three logistic regression models using different variable selections and transformations:

• Model 1 (baseline): all predictors.
• Model 2 (stepwise selection): stepAIC() selects important variables.
• Model 3 (transformed variables): transformed and interaction variables only.
# Load library
library(MASS)
# Logistic Regression Model 1 (Baseline)
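# Caveat: crime_training_df still carries the predicted_prob and logit
# columns created during the linearity check, plus *_scaled duplicates of
# the raw predictors, so `target ~ .` includes them all. The exact
# duplicates surface as NA ("singularity") coefficients in the summary
# below, and predicted_prob leaks an earlier model's fitted values back
# in as a predictor. A cleaner baseline would drop these columns first, e.g.
# crime_training_df %>% dplyr::select(-predicted_prob, -logit, -ends_with("_scaled"))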
model1 <- glm(target ~ ., data=crime_training_df, family=binomial)
# Logistic Regression Model 2 (Stepwise Selection)
model2 <- stepAIC(glm(target ~ ., data=crime_training_df, family=binomial), direction="both")
## Start: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## rm_scaled + age_scaled + dis_scaled + rad_scaled + tax_scaled +
## ptratio_scaled + age_group + lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## rm_scaled + age_scaled + dis_scaled + rad_scaled + tax_scaled +
## age_group + lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## rm_scaled + age_scaled + dis_scaled + rad_scaled + age_group +
## lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## rm_scaled + age_scaled + dis_scaled + age_group + lstat_medv_interact +
## tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## rm_scaled + age_scaled + age_group + lstat_medv_interact +
## tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## rm_scaled + age_group + lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + nox_scaled +
## age_group + lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + indus_scaled + age_group +
## lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + zn_scaled + age_group + lstat_medv_interact +
## tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + logit + log_medv +
## log_lstat + log_dis + age_group + lstat_medv_interact + tax_rad_interact
##
##
## Step: AIC=215.97
## target ~ zn + indus + chas + nox + rm + age + dis + rad + tax +
## ptratio + lstat + medv + predicted_prob + log_medv + log_lstat +
## log_dis + age_group + lstat_medv_interact + tax_rad_interact
##
## Df Deviance AIC
## - indus 1 173.99 213.99
## - lstat_medv_interact 1 174.01 214.01
## - log_medv 1 174.07 214.07
## - tax 1 174.13 214.13
## - log_lstat 1 174.16 214.16
## - zn 1 174.19 214.19
## - predicted_prob 1 174.22 214.22
## - chas 1 174.31 214.31
## - age_group 2 176.44 214.44
## - tax_rad_interact 1 175.07 215.07
## - medv 1 175.09 215.09
## - lstat 1 175.27 215.27
## - rm 1 175.86 215.86
## <none> 173.97 215.97
## - age 1 176.11 216.11
## - rad 1 177.98 217.98
## - dis 1 180.14 220.14
## - log_dis 1 183.16 223.16
## - ptratio 1 186.62 226.62
## - nox 1 189.02 229.02
##
## Step: AIC=213.99
## target ~ zn + chas + nox + rm + age + dis + rad + tax + ptratio +
## lstat + medv + predicted_prob + log_medv + log_lstat + log_dis +
## age_group + lstat_medv_interact + tax_rad_interact
##
## Df Deviance AIC
## - lstat_medv_interact 1 174.03 212.03
## - log_medv 1 174.08 212.08
## - tax 1 174.14 212.14
## - log_lstat 1 174.17 212.17
## - zn 1 174.22 212.22
## - predicted_prob 1 174.29 212.29
## - chas 1 174.40 212.40
## - age_group 2 176.49 212.49
## - tax_rad_interact 1 175.07 213.07
## - medv 1 175.09 213.09
## - lstat 1 175.27 213.27
## - rm 1 175.90 213.90
## <none> 173.99 213.99
## - age 1 176.11 214.11
## + indus 1 173.97 215.97
## + indus_scaled 1 173.97 215.97
## - rad 1 178.36 216.36
## - dis 1 180.95 218.95
## - log_dis 1 185.17 223.17
## - ptratio 1 186.71 224.71
## - nox 1 189.36 227.36
##
## Step: AIC=212.03
## target ~ zn + chas + nox + rm + age + dis + rad + tax + ptratio +
## lstat + medv + predicted_prob + log_medv + log_lstat + log_dis +
## age_group + tax_rad_interact
##
## Df Deviance AIC
## - tax 1 174.22 210.22
## - log_medv 1 174.24 210.24
## - zn 1 174.29 210.29
## - predicted_prob 1 174.31 210.31
## - chas 1 174.44 210.44
## - age_group 2 176.65 210.65
## - tax_rad_interact 1 175.11 211.11
## - rm 1 175.91 211.91
## - lstat 1 175.97 211.97
## <none> 174.03 212.03
## - log_lstat 1 176.10 212.10
## - age 1 176.11 212.11
## - medv 1 176.32 212.32
## + lstat_medv_interact 1 173.99 213.99
## + indus_scaled 1 174.01 214.01
## + indus 1 174.01 214.01
## - rad 1 178.43 214.43
## - dis 1 180.97 216.97
## - log_dis 1 185.24 221.24
## - ptratio 1 186.71 222.71
## - nox 1 189.74 225.74
##
## Step: AIC=210.22
## target ~ zn + chas + nox + rm + age + dis + rad + ptratio + lstat +
## medv + predicted_prob + log_medv + log_lstat + log_dis +
## age_group + tax_rad_interact
##
## Df Deviance AIC
## - log_medv 1 174.40 208.40
## - zn 1 174.44 208.44
## - chas 1 174.59 208.59
## - age_group 2 176.70 208.70
## - predicted_prob 1 174.70 208.70
## - lstat 1 176.13 210.13
## - rm 1 176.19 210.19
## <none> 174.22 210.22
## - age 1 176.24 210.24
## - log_lstat 1 176.25 210.25
## - medv 1 176.48 210.48
## - tax_rad_interact 1 176.71 210.71
## + tax 1 174.03 212.03
## + logit 1 174.03 212.03
## + tax_scaled 1 174.03 212.03
## + lstat_medv_interact 1 174.14 212.14
## + indus 1 174.22 212.22
## + indus_scaled 1 174.22 212.22
## - rad 1 179.80 213.80
## - dis 1 182.17 216.17
## - log_dis 1 186.56 220.56
## - ptratio 1 187.37 221.37
## - nox 1 189.82 223.82
##
## Step: AIC=208.4
## target ~ zn + chas + nox + rm + age + dis + rad + ptratio + lstat +
## medv + predicted_prob + log_lstat + log_dis + age_group +
## tax_rad_interact
##
## Df Deviance AIC
## - zn 1 174.62 206.62
## - age_group 2 176.72 206.72
## - chas 1 174.72 206.72
## - predicted_prob 1 174.78 206.78
## - rm 1 176.20 208.20
## <none> 174.40 208.40
## - age 1 176.55 208.55
## - tax_rad_interact 1 176.79 208.79
## - log_lstat 1 177.39 209.39
## - lstat 1 177.78 209.78
## + log_medv 1 174.22 210.22
## + tax 1 174.24 210.24
## + tax_scaled 1 174.24 210.24
## + logit 1 174.24 210.24
## + lstat_medv_interact 1 174.29 210.29
## + indus_scaled 1 174.40 210.40
## + indus 1 174.40 210.40
## - rad 1 179.93 211.93
## - medv 1 180.19 212.19
## - dis 1 182.56 214.56
## - log_dis 1 187.24 219.24
## - ptratio 1 187.90 219.90
## - nox 1 192.05 224.05
##
## Step: AIC=206.62
## target ~ chas + nox + rm + age + dis + rad + ptratio + lstat +
## medv + predicted_prob + log_lstat + log_dis + age_group +
## tax_rad_interact
##
## Df Deviance AIC
## - age_group 2 176.75 204.75
## - chas 1 174.97 204.97
## - predicted_prob 1 175.65 205.65
## <none> 174.62 206.62
## - age 1 176.66 206.66
## - rm 1 176.72 206.72
## - tax_rad_interact 1 176.81 206.81
## - log_lstat 1 177.53 207.53
## - lstat 1 177.88 207.88
## + logit 1 174.25 208.25
## + zn 1 174.40 208.40
## + zn_scaled 1 174.40 208.40
## + log_medv 1 174.44 208.44
## + tax_scaled 1 174.51 208.51
## + tax 1 174.51 208.51
## + lstat_medv_interact 1 174.52 208.52
## + indus 1 174.62 208.62
## + indus_scaled 1 174.62 208.62
## - rad 1 179.93 209.93
## - medv 1 180.34 210.34
## - dis 1 185.08 215.08
## - log_dis 1 188.78 218.78
## - ptratio 1 190.24 220.24
## - nox 1 192.08 222.08
##
## Step: AIC=204.75
## target ~ chas + nox + rm + age + dis + rad + ptratio + lstat +
## medv + predicted_prob + log_lstat + log_dis + tax_rad_interact
##
## Df Deviance AIC
## - chas 1 177.01 203.01
## - predicted_prob 1 177.90 203.90
## - rm 1 178.59 204.59
## - tax_rad_interact 1 178.62 204.62
## <none> 176.75 204.75
## - log_lstat 1 180.45 206.45
## + age_group 2 174.62 206.62
## + logit 1 176.68 206.68
## + tax 1 176.72 206.72
## + tax_scaled 1 176.72 206.72
## + zn 1 176.72 206.72
## + zn_scaled 1 176.72 206.72
## + log_medv 1 176.73 206.73
## + indus_scaled 1 176.74 206.74
## + indus 1 176.74 206.74
## + lstat_medv_interact 1 176.75 206.75
## - lstat 1 181.07 207.07
## - age 1 181.25 207.25
## - rad 1 181.66 207.66
## - medv 1 181.92 207.92
## - dis 1 186.37 212.37
## - log_dis 1 189.72 215.72
## - ptratio 1 191.70 217.70
## - nox 1 193.03 219.03
##
## Step: AIC=203.01
## target ~ nox + rm + age + dis + rad + ptratio + lstat + medv +
## predicted_prob + log_lstat + log_dis + tax_rad_interact
##
## Df Deviance AIC
## - predicted_prob 1 178.45 202.45
## - tax_rad_interact 1 178.96 202.96
## - rm 1 178.99 202.99
## <none> 177.01 203.01
## + chas 1 176.75 204.75
## - log_lstat 1 180.86 204.86
## + logit 1 176.94 204.94
## + indus 1 176.96 204.96
## + indus_scaled 1 176.96 204.96
## + age_group 2 174.97 204.97
## + zn_scaled 1 176.97 204.97
## + zn 1 176.97 204.97
## + tax_scaled 1 176.99 204.99
## + tax 1 176.99 204.99
## + log_medv 1 177.00 205.00
## + lstat_medv_interact 1 177.01 205.01
## - age 1 181.59 205.59
## - lstat 1 181.62 205.62
## - rad 1 182.09 206.09
## - medv 1 182.23 206.23
## - dis 1 187.05 211.05
## - log_dis 1 190.08 214.08
## - ptratio 1 191.70 215.70
## - nox 1 193.04 217.04
##
## Step: AIC=202.45
## target ~ nox + rm + age + dis + rad + ptratio + lstat + medv +
## log_lstat + log_dis + tax_rad_interact
##
## Df Deviance AIC
## <none> 178.45 202.45
## - rm 1 180.84 202.84
## + predicted_prob 1 177.01 203.01
## + logit 1 177.56 203.56
## + zn 1 177.73 203.73
## + zn_scaled 1 177.73 203.73
## + chas 1 177.90 203.90
## + indus 1 178.15 204.15
## + indus_scaled 1 178.15 204.15
## + tax_scaled 1 178.24 204.24
## + tax 1 178.24 204.24
## + age_group 2 176.26 204.26
## + lstat_medv_interact 1 178.36 204.36
## + log_medv 1 178.41 204.41
## - log_lstat 1 182.72 204.72
## - lstat 1 183.31 205.31
## - tax_rad_interact 1 183.89 205.89
## - medv 1 185.16 207.16
## - age 1 188.21 210.21
## - dis 1 191.26 213.26
## - ptratio 1 193.88 215.88
## - log_dis 1 198.19 220.19
## - rad 1 199.02 221.02
## - nox 1 272.24 294.24
# Logistic Regression Model 3 (With Transformations & Interactions)
model3 <- glm(target ~ log_medv + lstat + nox + ptratio + age_group, data=crime_training_df, family=binomial)
#summary(model3)
#AIC(model3) # Compare AIC
# Summary of models
summary(model1)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = crime_training_df)
##
## Coefficients: (10 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.234e+01 4.713e+01 -0.686 0.492539
## zn -1.481e-02 3.416e-02 -0.433 0.664664
## indus 8.222e-03 5.831e-02 0.141 0.887866
## chas 4.753e-01 8.114e-01 0.586 0.558009
## nox 4.628e+01 1.712e+01 2.703 0.006869 **
## rm -1.130e+00 8.284e-01 -1.363 0.172751
## age 3.708e-02 2.566e-02 1.445 0.148416
## dis -2.587e+00 1.109e+00 -2.333 0.019666 *
## rad 1.001e+00 5.071e-01 1.974 0.048395 *
## tax -1.511e-03 3.831e-03 -0.394 0.693231
## ptratio 4.609e-01 1.378e-01 3.345 0.000823 ***
## lstat 2.604e-01 2.272e-01 1.146 0.251753
## medv 2.868e-01 2.685e-01 1.068 0.285532
## predicted_prob 1.249e+00 2.387e+00 0.523 0.600865
## logit NA NA NA NA
## log_medv -4.719e+00 1.470e+01 -0.321 0.748087
## log_lstat -5.993e+00 1.358e+01 -0.441 0.659011
## log_dis 1.645e+01 5.815e+00 2.829 0.004672 **
## zn_scaled NA NA NA NA
## indus_scaled NA NA NA NA
## nox_scaled NA NA NA NA
## rm_scaled NA NA NA NA
## age_scaled NA NA NA NA
## dis_scaled NA NA NA NA
## rad_scaled NA NA NA NA
## tax_scaled NA NA NA NA
## ptratio_scaled NA NA NA NA
## age_groupMiddle-aged -1.761e+00 1.297e+00 -1.358 0.174571
## age_groupOld -1.291e+00 1.729e+00 -0.747 0.455306
## lstat_medv_interact 7.418e-01 3.595e+00 0.206 0.836521
## tax_rad_interact -9.495e-04 7.463e-04 -1.272 0.203281
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 173.97 on 445 degrees of freedom
## AIC: 215.97
##
## Number of Fisher Scoring iterations: 10
#summary(model3)
We evaluate model performance and select the best model based on multiple criteria.

Evaluation metrics:

• Accuracy: (TP + TN) / (TP + TN + FP + FN)
• Precision: TP / (TP + FP)
• Recall (sensitivity): TP / (TP + FN)
• Specificity: TN / (TN + FP)
• F1 score: 2 * (Precision * Recall) / (Precision + Recall)
• AUC-ROC curve: evaluates model discrimination.
# Load necessary library
library(caret)
library(pROC)
# Predict on training data
pred_probs <- predict(model1, type="response")
pred_classes <- ifelse(pred_probs > 0.5, 1, 0)
# Confusion Matrix
conf_matrix <- table(Predicted=pred_classes, Actual=crime_training_df$target)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 223 15
## 1 14 214
# Compute accuracy, precision, recall, F1-score
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2,2] / sum(conf_matrix[2,]) # TP / (TP + FP): row Predicted = 1
recall <- conf_matrix[2,2] / sum(conf_matrix[,2])    # TP / (TP + FN): column Actual = 1
f1_score <- 2 * (precision * recall) / (precision + recall)
f1_score
## [1] 0.9365427
# Print performance metrics
cat("Accuracy:", accuracy, "\nPrecision:", precision, "\nRecall:", recall, "\nF1 Score:", f1_score)
## Accuracy: 0.9377682
## Precision: 0.9385965
## Recall: 0.9344978
## F1 Score: 0.9365427
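As an optional cross-check (a sketch; caret is already loaded and computes the same metrics directly when given factors with the positive class set to "1"):

# Optional cross-check with caret's confusionMatrix (positive class = "1")
cm <- caret::confusionMatrix(factor(pred_classes, levels = c(0, 1)),
                             factor(crime_training_df$target, levels = c(0, 1)),
                             positive = "1")
cm$overall["Accuracy"]
cm$byClass[c("Precision", "Recall", "F1")]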
pred_prob_model1 <- predict(model1, type = "response")
pred_prob_model2 <- predict(model2, type = "response")
pred_prob_model3 <- predict(model3, type = "response")
# Convert probabilities into binary classifications (Threshold = 0.5)
pred_class_model1 <- ifelse(pred_prob_model1 > 0.5, 1, 0)
pred_class_model2 <- ifelse(pred_prob_model2 > 0.5, 1, 0)
pred_class_model3 <- ifelse(pred_prob_model3 > 0.5, 1, 0)
conf_matrix_model1 <- table(Predicted = pred_class_model1, Actual = crime_training_df$target)
conf_matrix_model2 <- table(Predicted = pred_class_model2, Actual = crime_training_df$target)
conf_matrix_model3 <- table(Predicted = pred_class_model3, Actual = crime_training_df$target)
print(conf_matrix_model1)
## Actual
## Predicted 0 1
## 0 223 15
## 1 14 214
print(conf_matrix_model2)
## Actual
## Predicted 0 1
## 0 222 19
## 1 15 210
print(conf_matrix_model3)
## Actual
## Predicted 0 1
## 0 207 39
## 1 30 190
# Define a function to compute accuracy, precision, recall, and F1-score
# (conf_matrix rows = Predicted, columns = Actual; [2,2] = true positives)
evaluate_model <- function(conf_matrix) {
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
precision <- conf_matrix[2,2] / sum(conf_matrix[2,]) # TP / (TP + FP)
recall <- conf_matrix[2,2] / sum(conf_matrix[,2])    # TP / (TP + FN)
f1_score <- 2 * ((precision * recall) / (precision + recall))
return(list(Accuracy = accuracy, Precision = precision, Recall = recall, F1_Score = f1_score))
}
eval_model1 <- evaluate_model(conf_matrix_model1)
eval_model2 <- evaluate_model(conf_matrix_model2)
eval_model3 <- evaluate_model(conf_matrix_model3)
print(eval_model1)
## $Accuracy
## [1] 0.9377682
##
## $Precision
## [1] 0.9385965
##
## $Recall
## [1] 0.9344978
##
## $F1_Score
## [1] 0.9365427
print(eval_model2)
## $Accuracy
## [1] 0.9270386
##
## $Precision
## [1] 0.9333333
##
## $Recall
## [1] 0.9170306
##
## $F1_Score
## [1] 0.9251101
print(eval_model3)
## $Accuracy
## [1] 0.8519313
##
## $Precision
## [1] 0.8636364
##
## $Recall
## [1] 0.8296943
##
## $F1_Score
## [1] 0.8463252
roc_model1 <- roc(crime_training_df$target, pred_prob_model1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_model2 <- roc(crime_training_df$target, pred_prob_model2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_model3 <- roc(crime_training_df$target, pred_prob_model3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_model1
##
## Call:
## roc.default(response = crime_training_df$target, predictor = pred_prob_model1)
##
## Data: pred_prob_model1 in 237 controls (crime_training_df$target 0) < 229 cases (crime_training_df$target 1).
## Area under the curve: 0.9782
roc_model2
##
## Call:
## roc.default(response = crime_training_df$target, predictor = pred_prob_model2)
##
## Data: pred_prob_model2 in 237 controls (crime_training_df$target 0) < 229 cases (crime_training_df$target 1).
## Area under the curve: 0.9776
roc_model3
##
## Call:
## roc.default(response = crime_training_df$target, predictor = pred_prob_model3)
##
## Data: pred_prob_model3 in 237 controls (crime_training_df$target 0) < 229 cases (crime_training_df$target 1).
## Area under the curve: 0.9472
# Plot ROC curves for comparison
plot(roc_model1, col="red", main="ROC Curves for Logistic Models")
plot(roc_model2, col="blue", add=TRUE)
plot(roc_model3, col="green", add=TRUE)
legend("bottomright", legend=c("Model 1", "Model 2", "Model 3"), col=c("red", "blue", "green"), lwd=2)
| Model | Accuracy | Precision | Recall | F1 Score | AUC |
|---|---|---|---|---|---|
| Model 1 | 0.938 | 0.939 | 0.934 | 0.937 | 0.9782 |
| Model 2 | 0.927 | 0.933 | 0.917 | 0.925 | 0.9776 |
| Model 3 | 0.852 | 0.864 | 0.830 | 0.846 | 0.9472 |
Best choice: Model 1. Model 1 is the best choice because it has:

• The highest accuracy (0.938): it makes the fewest classification errors.
• The highest F1 score (0.937): it balances precision and recall effectively.
• The highest AUC (0.9782): it is best at distinguishing between the two classes.

Model 1 is the most reliable and best at distinguishing high-crime neighborhoods. Model 2 performs nearly as well, with slightly lower accuracy (0.927) and AUC (0.9776). Model 3 performs noticeably worse, with much lower accuracy (0.852) and AUC (0.9472). Note that all three models are evaluated on the same data they were trained on, so these figures are optimistic; a holdout set or cross-validation would give a fairer comparison.
What does "fewest classification errors" mean? A classification error happens when the model predicts the wrong category:

• False negative (FN): the model predicts 0 (low crime), but the actual crime rate is high (1).
• False positive (FP): the model predicts 1 (high crime), but the actual crime rate is low (0).

Higher accuracy means fewer FPs and FNs, i.e., more correct predictions.
Accuracy measures the overall correctness of the classification model: the proportion of correct predictions (both positive and negative) out of all predictions.

Accuracy = (TP + TN) / (TP + TN + FP + FN)
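Plugging in Model 1's confusion matrix (TN = 223, TP = 214, FP = 14, FN = 15):

Accuracy = (214 + 223) / 466 = 437 / 466 ≈ 0.9377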
False negatives are dangerous (crime exists but the area is predicted "safe"). Consequence: a high-crime area is misclassified as low-crime, so authorities don't take the necessary action:

• Police resources may not be allocated where crime is actually high.
• Residents feel falsely safe while facing real crime threats.
• Crime could increase due to lack of intervention.

False positives cause unnecessary fear and resource waste (a safe area is misclassified as high-crime). Consequence: a low-crime area is misclassified as high-crime, so unnecessary actions are taken:

• Increased policing in safe areas wastes law enforcement resources.
• Real estate values may drop due to the incorrect classification.
• Residents may feel unsafe even though crime is low.
Bottom line: the highest accuracy (0.938) makes Model 1 the best model for crime prevention.
An F1 score of 0.937 means the model balances precision and recall very well:

F1 = 2 × (Precision × Recall) / (Precision + Recall)
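Equivalently, F1 = 2·TP / (2·TP + FP + FN); for Model 1 this is 2 × 214 / (428 + 14 + 15) = 428 / 457 ≈ 0.9365, matching the reported value.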
• Precision ("when the model says high crime, is it actually high crime?"): high precision means few false positives, so the model doesn't falsely label safe areas as high-crime.
• Recall ("did the model find all the high-crime areas?"): high recall means few false negatives, so the model doesn't miss actual high-crime areas.

A high F1 score (0.937) means few false positives (safe areas aren't wrongly classified as high-crime), few false negatives (actual high-crime areas are correctly identified), and a well-balanced, reliable model.
The ROC (receiver operating characteristic) curve is a graph that shows how well a model can separate two classes (here, high-crime vs. low-crime areas). The AUC (area under the curve) summarizes in a single number how well the model separates high-crime (1) from low-crime (0) areas.

A high AUC (0.9782) means:

• The model correctly distinguishes between crime-prone and safe areas most of the time.
• It rarely confuses high-crime neighborhoods with low-crime ones.
• It performs far better than random guessing (AUC = 0.5).

In simple terms: with AUC = 0.9782, the model is very effective at separating high-crime and low-crime areas.
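Whether Model 1's small AUC edge over Model 2 is statistically meaningful could be checked with DeLong's test for paired ROC curves (a sketch using the roc objects created above):

# Sketch: DeLong's test for a difference between two correlated ROC curves
roc.test(roc_model1, roc_model2, method = "delong")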
Once the best model is selected, we use it to generate predictions for crime_evaluation_df. Note, however, that Model 1 cannot score the evaluation set, because it was fit with the predicted_prob column that exists only in the training data; the predictions below therefore use Model 2, whose performance is nearly identical.
# Handle missing values (if any)
#crime_training_df$medv[is.na(crime_training_df$medv)] <- median(crime_training_df$medv, na.rm = TRUE)
# Fill missing values with median
crime_evaluation_df$medv[is.na(crime_evaluation_df$medv)] <- median(crime_evaluation_df$medv, na.rm = TRUE)
crime_evaluation_df$lstat[is.na(crime_evaluation_df$lstat)] <- median(crime_evaluation_df$lstat, na.rm = TRUE)
crime_evaluation_df$dis[is.na(crime_evaluation_df$dis)] <- median(crime_evaluation_df$dis, na.rm = TRUE)
# Log transformation
#crime_training_df$log_medv <- log(crime_training_df$medv + 1)
crime_evaluation_df <- crime_evaluation_df %>%
mutate(
log_medv = log(medv + 1), # Avoid log(0)
log_lstat = log(lstat + 1),
log_dis = log(dis + 1)
)
# Standardization
#crime_training_df$zn <- scale(crime_training_df$zn)
#crime_training_df$indus <- scale(crime_training_df$indus)
#Standardization (Normalize Continuous Variables)
#Standardization ensures that variables are on the same scale, improving model stability.
crime_evaluation_df <- crime_evaluation_df %>%
mutate(
zn_scaled = scale(zn),
indus_scaled = scale(indus),
nox_scaled = scale(nox),
rm_scaled = scale(rm),
age_scaled = scale(age),
dis_scaled = scale(dis),
rad_scaled = scale(rad),
tax_scaled = scale(tax),
ptratio_scaled = scale(ptratio)
)
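# Note (caveat): scale() here standardizes with the evaluation set's own
# means and SDs. If a model actually used the *_scaled columns, the
# centers and scales saved from the training data should be reused
# instead, e.g. scale(x, center = train_mean, scale = train_sd) with
# train_mean/train_sd computed from crime_training_df. Model 2 retains
# none of the *_scaled columns, so this does not affect the predictions below.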
# Create categorical age-group bins
crime_evaluation_df$age_group <- cut(crime_evaluation_df$age, breaks=c(0, 30, 60, 100), labels=c("Young", "Middle-aged", "Old"))
# Create Interaction Terms
crime_evaluation_df <- crime_evaluation_df %>%
mutate(
lstat_medv_interact = log_lstat * log_medv, # Income & housing price interaction
tax_rad_interact = tax * rad # Tax burden & highway accessibility
)
colnames(crime_evaluation_df)
## [1] "zn" "indus" "chas"
## [4] "nox" "rm" "age"
## [7] "dis" "rad" "tax"
## [10] "ptratio" "lstat" "medv"
## [13] "log_medv" "log_lstat" "log_dis"
## [16] "zn_scaled" "indus_scaled" "nox_scaled"
## [19] "rm_scaled" "age_scaled" "dis_scaled"
## [22] "rad_scaled" "tax_scaled" "ptratio_scaled"
## [25] "age_group" "lstat_medv_interact" "tax_rad_interact"
#colnames(crime_training_df)
# Predict probabilities using model2 (model1 cannot score the evaluation
# set: it depends on the predicted_prob/logit columns that exist only in
# the training data)
eval_pred_prob <- predict(model2, newdata = crime_evaluation_df, type = "response")
# Convert probabilities to binary class (0 or 1) using a threshold of 0.5
eval_pred_class <- ifelse(eval_pred_prob > 0.5, 1, 0)
# Add predictions to the evaluation data
crime_evaluation_df$predicted_prob <- eval_pred_prob
crime_evaluation_df$predicted_class <- eval_pred_class
file_path <- "C:/Users/Uzma/Downloads/crime_evaluation_df.csv"
# Write dataframe to CSV
write.csv(crime_evaluation_df, file = file_path, row.names = FALSE)
crime_evaluation_df
## # A tibble: 40 × 29
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 7.07 0 0.469 7.18 61.1 4.97 2 242 17.8 4.03 34.7
## 2 0 8.14 0 0.538 6.10 84.5 4.46 4 307 21 10.3 18.2
## 3 0 8.14 0 0.538 6.50 94.4 4.45 4 307 21 12.8 18.4
## 4 0 8.14 0 0.538 5.95 82 3.99 4 307 21 27.7 13.2
## 5 0 5.96 0 0.499 5.85 41.5 3.93 5 279 19.2 8.77 21
## 6 25 5.13 0 0.453 5.74 66.2 7.23 8 284 19.7 13.2 18.7
## 7 25 5.13 0 0.453 5.97 93.4 6.82 8 284 19.7 14.4 16
## 8 0 4.49 0 0.449 6.63 56.1 4.44 3 247 18.5 6.53 26.6
## 9 0 4.49 0 0.449 6.12 56.8 3.75 3 247 18.5 8.44 22.2
## 10 0 2.89 0 0.445 6.16 69.6 3.50 2 276 18 11.3 21.4
## # ℹ 30 more rows
## # ℹ 17 more variables: log_medv <dbl>, log_lstat <dbl>, log_dis <dbl>,
## # zn_scaled <dbl[,1]>, indus_scaled <dbl[,1]>, nox_scaled <dbl[,1]>,
## # rm_scaled <dbl[,1]>, age_scaled <dbl[,1]>, dis_scaled <dbl[,1]>,
## # rad_scaled <dbl[,1]>, tax_scaled <dbl[,1]>, ptratio_scaled <dbl[,1]>,
## # age_group <fct>, lstat_medv_interact <dbl>, tax_rad_interact <dbl>,
## # predicted_prob <dbl>, predicted_class <dbl>