library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nnet)
library(skimr)
In this homework assignment, you will explore, analyze, and model a data set containing information on crime for various neighborhoods of a major city.
Each record has a response variable indicating whether the neighborhood's crime rate is above the median crime rate (1) or not (0).
Your objective is to build a binary logistic regression model on the training data set to predict whether a neighborhood will be at risk for high crime levels. You will provide classifications and probabilities for the evaluation data set using your binary logistic regression model. You may only use the variables given to you (or variables that you derive from them).
Predictors:

- zn: proportion of residential land zoned for large lots (over 25,000 square feet)
- indus: proportion of non-retail business acres per suburb
- chas: a dummy variable for whether the suburb borders the Charles River (1) or not (0)
- nox: nitrogen oxides concentration (parts per 10 million)
- rm: average number of rooms per dwelling
- age: proportion of owner-occupied units built prior to 1940
- dis: weighted mean of distances to five Boston employment centers
- rad: index of accessibility to radial highways
- tax: full-value property-tax rate per $10,000
- ptratio: pupil-teacher ratio by town
- lstat: lower status of the population (percent)
- medv: median value of owner-occupied homes in $1,000s
- target: whether the crime rate is above the median crime rate (1) or not (0) (response variable)
crime_train_data <- read_csv("crime-training-data_modified.csv")
## Rows: 466 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, lstat, medv...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
crime_eval_data <- read_csv("crime-evaluation-data_modified.csv")
## Rows: 40 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, lstat, medv
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(crime_train_data)
## # A tibble: 6 × 13
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 19.6 0 0.605 7.93 96.2 2.05 5 403 14.7 3.7 50
## 2 0 19.6 1 0.871 5.40 100 1.32 5 403 14.7 26.8 13.4
## 3 0 18.1 0 0.74 6.48 100 1.98 24 666 20.2 18.8 15.4
## 4 30 4.93 0 0.428 6.39 7.8 7.04 6 300 16.6 5.19 23.7
## 5 0 2.46 0 0.488 7.16 92.2 2.70 3 193 17.8 4.82 37.9
## 6 0 8.56 0 0.52 6.78 71.3 2.86 5 384 20.9 7.67 26.5
## # ℹ 1 more variable: target <dbl>
# View summary stats and missing values; round the numeric summaries to 2 decimals
skim_df_as_data_frame <- skim(crime_train_data) %>%
  arrange(desc(n_missing)) %>%
  mutate(across(
    c(numeric.mean, numeric.sd, numeric.p0, numeric.p25,
      numeric.p50, numeric.p75, numeric.p100),
    ~ round(.x, 2)
  ))
skim_df_as_data_frame
| Name | crime_train_data |
| Number of rows | 466 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| numeric | 13 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| zn | 0 | 1 | 11.58 | 23.36 | 0.00 | 0.00 | 0.00 | 16.25 | 100.00 | ▇▁▁▁▁ |
| indus | 0 | 1 | 11.11 | 6.85 | 0.46 | 5.15 | 9.69 | 18.10 | 27.74 | ▇▆▁▇▁ |
| chas | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| nox | 0 | 1 | 0.55 | 0.12 | 0.39 | 0.45 | 0.54 | 0.62 | 0.87 | ▇▇▅▃▁ |
| rm | 0 | 1 | 6.29 | 0.70 | 3.86 | 5.89 | 6.21 | 6.63 | 8.78 | ▁▂▇▂▁ |
| age | 0 | 1 | 68.37 | 28.32 | 2.90 | 43.88 | 77.15 | 94.10 | 100.00 | ▂▂▂▃▇ |
| dis | 0 | 1 | 3.80 | 2.11 | 1.13 | 2.10 | 3.19 | 5.21 | 12.13 | ▇▅▂▁▁ |
| rad | 0 | 1 | 9.53 | 8.69 | 1.00 | 4.00 | 5.00 | 24.00 | 24.00 | ▇▂▁▁▃ |
| tax | 0 | 1 | 409.50 | 167.90 | 187.00 | 281.00 | 334.50 | 666.00 | 711.00 | ▇▇▅▁▇ |
| ptratio | 0 | 1 | 18.40 | 2.20 | 12.60 | 16.90 | 18.90 | 20.20 | 22.00 | ▁▃▅▅▇ |
| lstat | 0 | 1 | 12.63 | 7.10 | 1.73 | 7.04 | 11.35 | 16.93 | 37.97 | ▇▇▅▂▁ |
| medv | 0 | 1 | 22.59 | 9.24 | 5.00 | 17.02 | 21.20 | 25.00 | 50.00 | ▂▇▅▁▁ |
| target | 0 | 1 | 0.49 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
# save output to csv
write.csv(skim_df_as_data_frame, "H3_skim_output.csv", row.names = FALSE)
There are no missing values in the data set.
Target Variable: Number of neighborhoods whose crime rate is above or below the median
crime_train_data %>%
  mutate(target_renamed = ifelse(target == 0, "Below", "Above")) %>%
  ggplot(aes(x = as_factor(target_renamed))) +
  geom_bar(color = "#686868") +
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = 10, color = "white") +
  labs(title = "Number of Neighborhoods Above or Below Median Crime Rate",
       y = "Count",
       x = "Crime Rate") +
  theme_minimal()
# Feature columns: drop chas here; the trailing target element is removed
# with columns[-length(columns)] inside the pivots below
columns <- colnames(crime_train_data)
columns <- columns[columns != "chas"]
crime_train_data %>%
  mutate(target_renamed = ifelse(target == 0, "Below", "Above")) %>%
  pivot_longer(cols = columns[-length(columns)], names_to = "Feature", values_to = "Value") %>%
  ggplot(aes(x = as_factor(target_renamed), y = Value)) +
  geom_boxplot(color = "#686868") +
  facet_wrap(~ Feature, nrow = 4, ncol = 3, scales = "free") +
  labs(title = "Distribution Between Neighborhood Crime Rates",
       y = "",
       x = "Crime Rate") +
  theme_minimal()
# Evaluate variable 'chas' separately
crime_train_data %>%
  mutate(target_renamed = ifelse(target == 0, "Below", "Above")) %>%
  ggplot(aes(x = as_factor(target_renamed), fill = as_factor(chas))) +
  geom_bar(position = "stack") +
  labs(title = "Number of Neighborhoods Bordering the Charles River in each Target Category",
       y = "Count",
       x = "Crime Rate",
       fill = "Borders River") +
  theme_minimal()
There is a slightly higher proportion of neighborhoods bordering the
Charles River in the group with crime rates above the median.
crime_train_data %>%
  mutate(target_renamed = ifelse(target == 0, "Below", "Above")) %>%
  pivot_longer(cols = columns[-length(columns)], names_to = "Feature", values_to = "Value") %>%
  ggplot(aes(x = Value, fill = as_factor(target_renamed))) +
  geom_histogram() +
  facet_wrap(~ Feature, nrow = 4, ncol = 3, scales = "free") +
  labs(title = "Distribution Between Neighborhood Crime Rates",
       y = "",
       fill = "Crime Rate") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
The rad variable contains many observations with an index of 24, all
belonging to neighborhoods whose crime rate is above the median. We see
similar distributions in the tax, ptratio, and indus variables: a large
portion of the observations at the higher end of each variable's range
are in neighborhoods whose crime rate is above the median.
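As a quick check (a small sketch against the raw training data), we can tabulate how the rad = 24 spike splits across the two target classes:

# Sketch: count observations at the rad = 24 spike by target class
crime_train_data %>%
  filter(rad == 24) %>%
  count(target)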
crime_train_data %>% mutate(target_renamed = ifelse(target == 0, "Below", "Above"),
rad_cat = factor(ifelse(rad < 13, 0, 1))) %>%
ggplot(aes(x = as_factor(target_renamed), fill = rad_cat)) + geom_bar(position = "stack") +
labs(title = "Neighborhoods Above or Below 'rad' Index of 13 each Target Category",
y = "Count",
x = "Crime Rate",
fill="rad index") +
theme_minimal()
The zn variable contains mostly zeros, indicating that the proportion of
residential land zoned for large lots (over 25,000 square feet) is 0.
This continuous variable can be discretized into two categories: 0 or
above 0.
crime_train_data %>% mutate(target_renamed = ifelse(target == 0, "Below", "Above"),
zn_cat = factor(ifelse(zn == 0, 0, 1))) %>%
ggplot(aes(x = as_factor(target_renamed), fill = zn_cat)) + geom_bar(position = "stack") +
labs(title = "Number of Neighborhoods with No Land Zoned for Large Lot Spaces",
y = "Count",
x = "Crime Rate",
fill="Large Lot Spaces") +
theme_minimal()
library(corrplot)
## corrplot 0.95 loaded
cor_coef <- cor(crime_train_data[-c(3, 13)])
corrplot::corrplot(cor_coef, method = "number", title = "Correlations Between Continuous Predictors",
order = "hclust", number.cex = 0.75, type = "lower")
Encode the categorical variables as factors.
# chas
crime_train_data$chas <- as_factor(crime_train_data$chas)
#target
crime_train_data$target <- as_factor(crime_train_data$target)
Assess the distribution of the centered and scaled continuous predictor variables.
# Scaling (centered value/sd) is performed after centering (x - mean)
scaled_data <- crime_train_data[-c(3, 13)] %>% scale(center = TRUE, scale = TRUE) %>% as_tibble()
cbind(scaled_data, crime_train_data[13]) %>%
  mutate(target_renamed = ifelse(target == 0, "Below", "Above")) %>%
  pivot_longer(cols = columns[-length(columns)], names_to = "Feature", values_to = "Value") %>%
  ggplot(aes(x = Value, fill = as_factor(target_renamed))) +
  geom_histogram() +
  facet_wrap(~ Feature, nrow = 4, ncol = 3, scales = "free") +
  labs(title = "Distribution Between Neighborhood Crime Rates",
       y = "",
       fill = "Crime Rate") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
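A quick sanity check, shown as a sketch: after centering and scaling, every column of scaled_data should have mean approximately 0 and standard deviation 1.

# Sketch: verify the standardized columns have mean ~0 and sd ~1
round(colMeans(scaled_data), 3)
round(sapply(scaled_data, sd), 3)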
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.9 ✔ rsample 1.3.1
## ✔ dials 1.4.2 ✔ tailor 0.1.0
## ✔ infer 1.0.9 ✔ tune 2.0.0
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.3.3 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.1 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
set.seed(100)
# Split data into train and test sets
train_test_split <- initial_split(crime_train_data, prop = 0.7)
## TRAIN SET
train_df <- training(train_test_split)
# Predictors
x_train <- dplyr::select(train_df, -target)
# Center and scale predictors
# Identify numeric columns
numeric_cols <- sapply(x_train, is.numeric)
# Apply scale() only to numeric columns
scaled_df <- as.data.frame(lapply(x_train[, numeric_cols], scale))
# Recombine the scaled numerics with the non-numeric (factor) columns
X_train_scaled <- cbind(x_train[, !numeric_cols], scaled_df)
# Target
y_train <- train_df$target
training_data <- data.frame(target = y_train, X_train_scaled)
## TEST SET
test_df <- testing(train_test_split)
# Predictors
x_test <- dplyr::select(test_df, -target)
# Center and scale predictors
# Identify numeric columns
numeric_cols <- sapply(x_test, is.numeric)
# Apply scale() only to numeric columns
scaled_df_test <- as.data.frame(lapply(x_test[, numeric_cols], scale))
# Recombine the scaled numerics with the non-numeric (factor) columns
X_test_scaled <- cbind(x_test[, !numeric_cols], scaled_df_test)
# Target
y_test <- test_df$target
testing_data <- data.frame(target = y_test, X_test_scaled)
logreg1 <- glm(target ~ ., family=binomial, data = training_data)
summary(logreg1)
##
## Call:
## glm(formula = target ~ ., family = binomial, data = training_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.1033 0.7543 2.788 0.00530 **
## chas1 1.0266 1.0516 0.976 0.32900
## zn -1.6844 0.9806 -1.718 0.08583 .
## indus -0.2092 0.3931 -0.532 0.59449
## nox 5.6163 1.0983 5.114 3.16e-07 ***
## rm -0.4921 0.5748 -0.856 0.39194
## age 0.7629 0.5015 1.521 0.12821
## dis 1.5354 0.5560 2.761 0.00575 **
## rad 6.6575 1.6747 3.975 7.03e-05 ***
## tax -1.2196 0.5497 -2.219 0.02652 *
## ptratio 0.9799 0.3226 3.038 0.00238 **
## lstat 0.4185 0.4305 0.972 0.33100
## medv 1.8747 0.7477 2.507 0.01217 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.70 on 325 degrees of freedom
## Residual deviance: 136.83 on 313 degrees of freedom
## AIC: 162.83
##
## Number of Fisher Scoring iterations: 9
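Because the numeric predictors were standardized, each coefficient can be read on the odds scale: exp(beta) is the multiplicative change in the odds of high crime per one-standard-deviation increase in that predictor (for chas1, per level change). A quick sketch:

# Sketch: coefficients as odds ratios (per 1-SD change in each scaled predictor)
round(exp(coef(logreg1)), 2)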
logreg_metrics <- function(cm){
TN <- cm[1,1]
TP <- cm[2,2]
FN <- cm[2,1]
FP <- cm[1,2]
acc = (TP + TN) / sum(cm)
er = (FP + FN) / sum(cm)
pre = TP / (TP + FP)
sens = TP / (TP + FN)
spec = TN / (TN + FP)
f1 = (2 * pre * sens) / (pre + sens)
metrics <- tibble(Metric = c("Accuracy", "Error_rate", "Precision",
"Sensitivity", "Specificity", "F1_score"),
Value = round(c(acc,er,pre,sens,spec,f1), 3))
metrics
}
# Generate predictions using testing data
pred1_probs <- predict(logreg1, testing_data, type = 'response')
# Get the predicted labels
pred1 <- ifelse(pred1_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm1 <- table(actual = testing_data$target, prediction = pred1)
metrics1 <- logreg_metrics(cm1)
metrics1
## # A tibble: 6 × 2
## Metric Value
## <chr> <dbl>
## 1 Accuracy 0.886
## 2 Error_rate 0.114
## 3 Precision 1
## 4 Sensitivity 0.789
## 5 Specificity 1
## 6 F1_score 0.882
# Create the plot
ggplot(data = data.frame(cm1), aes(x = actual, y = prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold") +
# Customize colors
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(title = "Confusion Matrix", x = "Actual", y = "Predicted") +
theme_minimal()
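The metrics above all depend on the 0.5 cutoff. A threshold-free comparison is also possible via the area under the ROC curve; a minimal sketch, assuming the pROC package is installed:

# Sketch: ROC AUC for logreg1 on the held-out test set (assumes pROC is available)
library(pROC)
roc1 <- roc(response = testing_data$target, predictor = pred1_probs)
auc(roc1)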
Improvements: Variable Selection and Variance Inflation Factor
Several of the variables are highly correlated with one another. This multicollinearity inflates the standard errors of the coefficient estimates, making it difficult to gauge the impact each predictor has on correctly identifying whether a neighborhood's crime rate is above or below the median. To identify variables with high multicollinearity, we can assess their variance inflation factor (VIF).
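For predictor j, VIF_j = 1 / (1 - R_j^2), where R_j^2 is the R-squared from regressing predictor j on all of the other predictors. As an illustrative sketch, computed from first principles for nox on the scaled training set:

# Sketch: VIF for nox by hand, VIF = 1 / (1 - R^2)
r2_nox <- summary(lm(nox ~ . - target, data = training_data))$r.squared
1 / (1 - r2_nox)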
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Write a function that takes a model, calculates VIF and plots it on a bar chart
plot_vif <- function(fitted_model, model_name){
# Calculate VIF
vif_values <- vif(fitted_model)
# Convert VIF results to a data frame for plotting
vif_df <- data.frame(Variable = names(vif_values), VIF = vif_values)
# Set a threshold to indicate high VIF
high_vif_threshold <- 5
# Generate bar plot
ggplot(vif_df, aes(Variable, VIF)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_hline(yintercept = high_vif_threshold, linetype = "dashed", color = "red") +
scale_y_continuous(limits = c(0, max(vif_df$VIF) + 1)) +
labs(title = paste0("Variance Inflation Factor (VIF) for Regression Model - ", model_name),
y = "VIF",
x = "Variable") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
plot_vif(logreg1, "Log Reg 1")
Since medv and rm have the highest VIF and rm was not a significant predictor, let's try dropping rm and checking the results. Before fitting another model, let's also include an interaction effect between tax and rad, since these two variables are highly correlated with each other.
crime_train_data %>% ggplot(aes(x = rad, y = tax, color = target)) + geom_point() +
labs(title = "Tax vs. Rad") +
theme_minimal()
We again see some clear outliers in this plot, which were noted in both
the box plots and histograms of each variable.
logreg2 <- update(logreg1, .~. - rm + tax:rad)
summary(logreg2)
##
## Call:
## glm(formula = target ~ chas + zn + indus + nox + age + dis +
## rad + tax + ptratio + lstat + medv + rad:tax, family = binomial,
## data = training_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.6366 0.6107 2.680 0.00737 **
## chas1 0.9217 1.0870 0.848 0.39646
## zn -2.0505 0.9886 -2.074 0.03808 *
## indus -0.1009 0.3979 -0.254 0.79978
## nox 5.3795 1.0601 5.075 3.88e-07 ***
## age 0.5990 0.4487 1.335 0.18191
## dis 1.4057 0.5330 2.637 0.00836 **
## rad 5.9667 1.3779 4.330 1.49e-05 ***
## tax -2.2883 0.8486 -2.697 0.00700 **
## ptratio 0.8643 0.2867 3.015 0.00257 **
## lstat 0.5529 0.4038 1.369 0.17091
## medv 1.4174 0.4650 3.048 0.00230 **
## rad:tax -1.7275 0.9245 -1.869 0.06168 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.70 on 325 degrees of freedom
## Residual deviance: 136.05 on 313 degrees of freedom
## AIC: 162.05
##
## Number of Fisher Scoring iterations: 9
# Generate predictions using testing data
pred2_probs <- predict(logreg2, testing_data, type = 'response')
# Get the predicted labels
pred2 <- ifelse(pred2_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm2 <- table(actual = testing_data$target, prediction = pred2)
# Calculate metrics
metrics2 <- logreg_metrics(cm2)
metrics2
## # A tibble: 6 × 2
## Metric Value
## <chr> <dbl>
## 1 Accuracy 0.9
## 2 Error_rate 0.1
## 3 Precision 1
## 4 Sensitivity 0.816
## 5 Specificity 1
## 6 F1_score 0.899
plot_vif(logreg2, "Log Reg 2")
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
Variable Selection: Subset Technique
Using the second model, logreg2, we generate the best subset of
predictors at each subset size.
library(ISLR2)
library(leaps)
log_subset <- regsubsets(logreg2$formula, data = training_data, nvmax = 12)
subset_summary <- summary(log_subset)
subset_summary
## Subset selection object
## Call: regsubsets.formula(logreg2$formula, data = training_data, nvmax = 12)
## 12 Variables (and intercept)
## Forced in Forced out
## chas1 FALSE FALSE
## zn FALSE FALSE
## indus FALSE FALSE
## nox FALSE FALSE
## age FALSE FALSE
## dis FALSE FALSE
## rad FALSE FALSE
## tax FALSE FALSE
## ptratio FALSE FALSE
## lstat FALSE FALSE
## medv FALSE FALSE
## rad:tax FALSE FALSE
## 1 subsets of each size up to 12
## Selection Algorithm: exhaustive
## chas1 zn indus nox age dis rad tax ptratio lstat medv rad:tax
## 1 ( 1 ) " " " " " " "*" " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " "*" " " " " "*" " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" "*" " " "*" " " " " " " " " " "
## 4 ( 1 ) " " " " " " "*" "*" " " "*" " " " " " " "*" " "
## 5 ( 1 ) " " " " " " "*" "*" " " "*" " " " " " " "*" "*"
## 6 ( 1 ) " " " " " " "*" "*" " " "*" "*" " " " " "*" "*"
## 7 ( 1 ) " " " " " " "*" "*" " " "*" "*" "*" " " "*" "*"
## 8 ( 1 ) " " " " " " "*" "*" " " "*" "*" "*" "*" "*" "*"
## 9 ( 1 ) " " " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) " " " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
subset_summary$bic
## [1] -237.8039 -257.6434 -269.2390 -273.6635 -273.3228 -273.0374 -270.8119
## [8] -267.3497 -262.9127 -257.7181 -251.9647 -246.1937
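The minimum BIC occurs at the four-predictor subset. A small sketch to extract that subset programmatically rather than reading it off the table:

# Sketch: subset size minimizing BIC, and the coefficients of that subset
best_size <- which.min(subset_summary$bic)
best_size
coef(log_subset, best_size)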
logreg3 <- update(logreg2, .~. - chas - zn - indus - dis - tax - ptratio - lstat - rad:tax)
summary(logreg3)
##
## Call:
## glm(formula = target ~ nox + age + rad + medv, family = binomial,
## data = training_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.7002 0.6000 2.833 0.004604 **
## nox 2.6793 0.5674 4.722 2.34e-06 ***
## age 0.5144 0.3559 1.445 0.148434
## rad 3.9611 1.0939 3.621 0.000293 ***
## medv 0.4070 0.2566 1.586 0.112678
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.70 on 325 degrees of freedom
## Residual deviance: 169.01 on 321 degrees of freedom
## AIC: 179.01
##
## Number of Fisher Scoring iterations: 8
# Generate predictions using testing data
pred3_probs <- predict(logreg3, testing_data, type = 'response')
# Get the predicted labels
pred3 <- ifelse(pred3_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm3 <- table(actual = testing_data$target, prediction = pred3)
# Calculate metrics
metrics3 <- logreg_metrics(cm3)
metrics3
## # A tibble: 6 × 2
## Metric Value
## <chr> <dbl>
## 1 Accuracy 0.879
## 2 Error_rate 0.121
## 3 Precision 1
## 4 Sensitivity 0.776
## 5 Specificity 1
## 6 F1_score 0.874
Variable Selection: Backwards Elimination
backwards_logreg <- stats::step(logreg2, direction = "backward", k = log(nrow(training_data))) # k = log(n) gives the BIC penalty
## Start: AIC=211.28
## target ~ chas + zn + indus + nox + age + dis + rad + tax + ptratio +
## lstat + medv + rad:tax
##
## Df Deviance AIC
## - indus 1 136.11 205.55
## - chas 1 136.78 206.23
## - rad:tax 1 137.57 207.01
## - lstat 1 137.88 207.33
## - age 1 138.00 207.45
## <none> 136.05 211.28
## - zn 1 141.95 211.39
## - dis 1 143.75 213.19
## - ptratio 1 145.97 215.42
## - medv 1 146.91 216.35
## - nox 1 182.49 251.93
##
## Step: AIC=205.55
## target ~ chas + zn + nox + age + dis + rad + tax + ptratio +
## lstat + medv + rad:tax
##
## Df Deviance AIC
## - chas 1 136.79 200.44
## - rad:tax 1 137.78 201.44
## - lstat 1 137.91 201.57
## - age 1 138.15 201.80
## <none> 136.11 205.55
## - zn 1 142.22 205.88
## - dis 1 143.80 207.46
## - ptratio 1 145.99 209.64
## - medv 1 147.00 210.65
## - nox 1 190.08 253.74
##
## Step: AIC=200.44
## target ~ zn + nox + age + dis + rad + tax + ptratio + lstat +
## medv + rad:tax
##
## Df Deviance AIC
## - rad:tax 1 138.68 196.54
## - lstat 1 138.96 196.83
## - age 1 139.18 197.05
## <none> 136.79 200.44
## - zn 1 143.62 201.49
## - dis 1 144.25 202.12
## - ptratio 1 146.20 204.07
## - medv 1 148.19 206.06
## - nox 1 190.16 248.03
##
## Step: AIC=196.54
## target ~ zn + nox + age + dis + rad + tax + ptratio + lstat +
## medv
##
## Df Deviance AIC
## - lstat 1 140.74 192.82
## - age 1 141.03 193.11
## - zn 1 144.45 196.53
## <none> 138.68 196.54
## - dis 1 146.42 198.50
## - ptratio 1 148.02 200.10
## - tax 1 148.40 200.49
## - medv 1 150.09 202.17
## - rad 1 180.80 232.88
## - nox 1 191.58 243.66
##
## Step: AIC=192.82
## target ~ zn + nox + age + dis + rad + tax + ptratio + medv
##
## Df Deviance AIC
## - age 1 145.39 191.69
## - zn 1 146.01 192.31
## <none> 140.74 192.82
## - dis 1 148.77 195.07
## - ptratio 1 149.57 195.87
## - tax 1 149.80 196.10
## - medv 1 150.23 196.53
## - rad 1 182.87 229.17
## - nox 1 194.12 240.41
##
## Step: AIC=191.69
## target ~ zn + nox + dis + rad + tax + ptratio + medv
##
## Df Deviance AIC
## - dis 1 150.45 190.96
## - zn 1 150.53 191.03
## <none> 145.39 191.69
## - medv 1 151.66 192.17
## - ptratio 1 152.55 193.06
## - tax 1 154.81 195.32
## - rad 1 186.56 227.06
## - nox 1 215.84 256.35
##
## Step: AIC=190.96
## target ~ zn + nox + rad + tax + ptratio + medv
##
## Df Deviance AIC
## - zn 1 152.76 187.49
## - medv 1 153.88 188.60
## <none> 150.45 190.96
## - ptratio 1 157.22 191.94
## - tax 1 161.97 196.69
## - rad 1 194.91 229.63
## - nox 1 245.19 279.91
##
## Step: AIC=187.49
## target ~ nox + rad + tax + ptratio + medv
##
## Df Deviance AIC
## - medv 1 155.64 184.58
## <none> 152.76 187.49
## - ptratio 1 160.99 189.92
## - tax 1 164.36 193.30
## - rad 1 197.13 226.07
## - nox 1 288.81 317.75
##
## Step: AIC=184.58
## target ~ nox + rad + tax + ptratio
##
## Df Deviance AIC
## - ptratio 1 161.15 184.30
## <none> 155.64 184.58
## - tax 1 170.66 193.81
## - rad 1 206.16 229.30
## - nox 1 288.82 311.97
##
## Step: AIC=184.3
## target ~ nox + rad + tax
##
## Df Deviance AIC
## <none> 161.15 184.30
## - tax 1 172.97 190.33
## - rad 1 206.66 224.02
## - nox 1 289.13 306.50
backwards_logreg$anova
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 NA NA 313 136.0460 211.2757
## 2 - indus 1 0.06478595 314 136.1108 205.5535
## 3 - chas 1 0.67779324 315 136.7886 200.4444
## 4 - rad:tax 1 1.88694734 316 138.6755 196.5445
## 5 - lstat 1 2.06371065 317 140.7392 192.8213
## 6 - age 1 4.65344063 318 145.3927 191.6878
## 7 - dis 1 5.05852459 319 150.4512 190.9595
## 8 - zn 1 2.31281488 320 152.7640 187.4854
## 9 - medv 1 2.87792043 321 155.6419 184.5764
## 10 - ptratio 1 5.50989614 322 161.1518 184.2994
backwards_logreg$formula
## target ~ nox + rad + tax
Fit a new regression with the predictors that survived backwards
elimination. We can update logreg2 to include only those
predictors.
logreg4 <- glm(target ~ nox + rad + tax, family = "binomial", data = training_data)
summary(logreg4)
##
## Call:
## glm(formula = target ~ nox + rad + tax, family = "binomial",
## data = training_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.2139 0.5861 3.777 0.000159 ***
## nox 4.1979 0.6162 6.812 9.62e-12 ***
## rad 5.5489 1.1788 4.707 2.51e-06 ***
## tax -1.3675 0.4462 -3.064 0.002180 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.70 on 325 degrees of freedom
## Residual deviance: 161.15 on 322 degrees of freedom
## AIC: 169.15
##
## Number of Fisher Scoring iterations: 8
BIC(logreg4)
## [1] 184.2994
# Generate predictions using testing data
pred4_probs <- predict(logreg4, testing_data, type = 'response')
# Get the predicted labels
pred4 <- ifelse(pred4_probs >= 0.5, 1, 0)
# Construct confusion matrix
cm4 <- table(actual = testing_data$target, prediction = pred4)
# Calculate metrics
metrics4 <- logreg_metrics(cm4)
metrics4
## # A tibble: 6 × 2
## Metric Value
## <chr> <dbl>
## 1 Accuracy 0.871
## 2 Error_rate 0.129
## 3 Precision 1
## 4 Sensitivity 0.763
## 5 Specificity 1
## 6 F1_score 0.866
anova(logreg2,logreg4)
## Analysis of Deviance Table
##
## Model 1: target ~ chas + zn + indus + nox + age + dis + rad + tax + ptratio +
## lstat + medv + rad:tax
## Model 2: target ~ nox + rad + tax
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 313 136.05
## 2 322 161.15 -9 -25.106 0.002856 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The final model is logreg2. The deviance test above rejects dropping the nine extra terms (p ≈ 0.003 in favor of the larger model), and logreg2 also performed best on unseen data (testing_data), with the highest accuracy, sensitivity, and F1 score of the four models.
# Create the plot
ggplot(data = data.frame(cm2), aes(x = actual, y = prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold") +
# Customize colors
scale_fill_gradient(low = "red", high = "lightblue") +
labs(title = "Confusion Matrix: LogReg2", x = "Actual", y = "Predicted") +
theme_minimal()
df12 <- merge(metrics1, metrics2, by = "Metric", suffixes = c("1", "2"))
df34 <- merge(metrics3,metrics4, by = "Metric", suffixes = c("3", "4"))
summary_df <- left_join(df12, df34, by = "Metric")
colnames(summary_df) <- c("Metric", "logreg1", "logreg2", "logreg3", "logreg4")
summary_df
## Metric logreg1 logreg2 logreg3 logreg4
## 1 Accuracy 0.886 0.900 0.879 0.871
## 2 Error_rate 0.114 0.100 0.121 0.129
## 3 F1_score 0.882 0.899 0.874 0.866
## 4 Precision 1.000 1.000 1.000 1.000
## 5 Sensitivity 0.789 0.816 0.776 0.763
## 6 Specificity 1.000 1.000 1.000 1.000
deviance <- c("Deviance", round(logreg1$deviance,2), round(logreg2$deviance,2), round(logreg3$deviance,2), round(logreg4$deviance,2))
full_summary_df <- rbind(summary_df, deviance)
full_summary_df
## Metric logreg1 logreg2 logreg3 logreg4
## 1 Accuracy 0.886 0.9 0.879 0.871
## 2 Error_rate 0.114 0.1 0.121 0.129
## 3 F1_score 0.882 0.899 0.874 0.866
## 4 Precision 1 1 1 1
## 5 Sensitivity 0.789 0.816 0.776 0.763
## 6 Specificity 1 1 1 1
## 7 Deviance 136.83 136.05 169.01 161.15
write.csv(full_summary_df, "summary_df.csv", row.names = FALSE)
# Preview the evaluation data before preparing it
head(crime_eval_data)
## # A tibble: 6 × 12
## zn indus chas nox rm age dis rad tax ptratio lstat medv
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 7.07 0 0.469 7.18 61.1 4.97 2 242 17.8 4.03 34.7
## 2 0 8.14 0 0.538 6.10 84.5 4.46 4 307 21 10.3 18.2
## 3 0 8.14 0 0.538 6.50 94.4 4.45 4 307 21 12.8 18.4
## 4 0 8.14 0 0.538 5.95 82 3.99 4 307 21 27.7 13.2
## 5 0 5.96 0 0.499 5.85 41.5 3.93 5 279 19.2 8.77 21
## 6 25 5.13 0 0.453 5.74 66.2 7.23 8 284 19.7 13.2 18.7
# Convert chas to a factor
crime_eval_data$chas <- as_factor(crime_eval_data$chas)
# Center and scale the evaluation predictors (note: this uses the eval set's own means/SDs)
scaled_features <- scale(crime_eval_data[-3])
eval_data_prepared <- cbind(scaled_features, crime_eval_data[3])
# Keep only the predictors used by the final model (rm is dropped)
eval_data_prepared <- eval_data_prepared %>% dplyr::select(c(chas, zn, indus, nox, age, dis, rad, tax, ptratio, lstat, medv))
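One caveat, shown as an alternative sketch: the block above standardizes the evaluation predictors with the evaluation set's own means and SDs, whereas the model was fit on predictors standardized with training-split statistics. Reusing the training means and SDs would keep both sets on the same scale (this assumes x_train and numeric_cols from the modeling section are still in scope):

# Alternative sketch: standardize the evaluation predictors with the
# training split's means and SDs instead of the eval set's own statistics
train_centers <- sapply(x_train[, numeric_cols], mean)
train_sds <- sapply(x_train[, numeric_cols], sd)
eval_scaled <- as_tibble(scale(crime_eval_data[names(train_centers)],
                               center = train_centers, scale = train_sds))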
# Generate predictions for the evaluation data
pred_eval_probs <- predict(logreg2, eval_data_prepared, type = 'response')
# Get the predicted labels
pred_eval <- ifelse(pred_eval_probs >= 0.5, 1, 0)
crime_eval_data$pred_probs <- pred_eval_probs
crime_eval_data$pred_labels <- pred_eval
write.csv(crime_eval_data, "crime_eval_data_predictions.csv", row.names = FALSE)