library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.2.3
library(ggrepel)
library(broom)
library(lindia)
## Warning: package 'lindia' was built under R version 4.2.3
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(readxl)
htd <- read.csv("htd.csv")
htd <- htd |>
mutate(offense_name_binary = ifelse(OFFENSE_SUBCAT_NAME == "Commercial Sex Acts", 1,0))
model1 <- glm(offense_name_binary ~ ACTUAL_COUNT, data = htd,
family = binomial(link = 'logit'))
model1$coefficients
## (Intercept) ACTUAL_COUNT
## 0.9568179446 0.0005005197
The intercept of approximately 0.957 represents the estimated
log-odds of the binary outcome offense_name_binary being 1
(or true) when ACTUAL_COUNT is 0. In practical terms, this
suggests that when there are no instances of ACTUAL_COUNT,
the model predicts a likelihood of around 72.6% for
offense_name_binary to be 1. The coefficient for
ACTUAL_COUNT (0.0005) signifies the change in the log-odds
of offense_name_binary being 1 for every one-unit increase
in ACTUAL_COUNT. While the effect seems small, it implies
that as ACTUAL_COUNT increases by one unit, the log-odds of
offense_name_binary being 1 rises by approximately 0.0005.
This indicates a subtle but positive association between
ACTUAL_COUNT and the likelihood of
offense_name_binary being 1, implying a slight increase in
the probability of the binary outcome with each additional count in
ACTUAL_COUNT.
sigmoid1 <- \(x) 1 / (1 + exp((0.9568179446 + 0.0005005197 * x)))
htd |>
ggplot(mapping = aes(x = ACTUAL_COUNT, y = offense_name_binary)) +
geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3) +
geom_function(fun = sigmoid1, color = 'blue', linewidth = 1) +
labs(title = "Modeling a Binary Response with Sigmoid") +
scale_y_continuous(breaks = c(0, 1)) +
theme_minimal()
## Warning: Ignoring unknown parameters: linewidth
The curve starts at a low probability, increases steeply, and then
levels off at a high probability. This means that as the count of human
trafficking cases increases, the probability of an offense name being
involuntary servitude (0) also increases. However, the probability does
not increase linearly, but rather in a sigmoid shape. This suggests that
there is a threshold value of the count where the probability changes
rapidly from low to high, and then levels off at a maximum value.
TRANSFORMATION OF ACTUAL_COUNT
htd |>
ggplot(mapping = aes(x = ACTUAL_COUNT)) +
geom_histogram(color = 'white') +
labs("Histogram of Actual count cases") +
theme_hc()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
htd <- htd |>
mutate(log_actual_count = log(ACTUAL_COUNT + 1))
## Warning in log(ACTUAL_COUNT + 1): NaNs produced
htd |>
ggplot(mapping = aes(x = JUVENILE_CLEARED_COUNT, y = log_actual_count)) +
geom_point(shape = 'O', size = 3) +
geom_smooth(se = FALSE) +
labs(title = "Log(actual_count) vs. juvenile_cleared_count") +
theme_minimal()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
htd <- htd |>
mutate(squared_JUVENILE_CLEARED_COUNT = JUVENILE_CLEARED_COUNT^2)
htd |>
ggplot(mapping = aes(x = squared_JUVENILE_CLEARED_COUNT, y = log_actual_count)) +
geom_point(shape = 'O', size = 3) +
geom_smooth(se = FALSE) +
labs(title = "Log(actual_count) vs. squared(Juvenile Cleared Count)") +
theme_minimal()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
model <- glm(squared_JUVENILE_CLEARED_COUNT ~ log_actual_count, data = htd,
family = poisson(link = 'log'))
model$coefficients
## (Intercept) log_actual_count
## 5.9958317 0.4657834
inv_log <- \(x) exp(5.996 + 0.466 * x)
htd |>
ggplot(mapping = aes(x = log_actual_count, y = squared_JUVENILE_CLEARED_COUNT)) +
geom_point() +
geom_function(fun = inv_log, color = 'blue', linewidth = 1) +
labs(title = "Modeling Price with the Poisson Model") +
theme_minimal()
## Warning: Ignoring unknown parameters: linewidth
## Warning: Removed 1 rows containing missing values (geom_point).
The plot, derived from transforming
ACTUAL_COUNT, suggests
that no further transformation is necessary for the
ACTUAL_COUNT explanatory variable. The vertical alignment
of data points within a specific range of
squared_JUVENILE_CLEARED_COUNT indicates minimal
variability in ACTUAL_COUNT values, implying stability in
this segment. As squared_JUVENILE_CLEARED_COUNT increases,
the plot shows a horizontal extension, suggesting a non-linear
relationship where higher values of the juvenile cleared count
exponentially influence the actual count. However, despite this
non-linearity, the overall shape of the plot does not indicate a need
for additional transformations on the ACTUAL_COUNT
variable. Therefore, the analysis suggests that the original
ACTUAL_COUNT variable is appropriate for modeling.
model2 <- glm(offense_name_binary ~ JUVENILE_CLEARED_COUNT, data = htd,
family = binomial(link = 'logit'))
model2$coefficients
## (Intercept) JUVENILE_CLEARED_COUNT
## 1.063906842 -0.001954318
sigmoid2 <- \(x) 1 / (1 + exp((1.063906842 + (-0.001954318 * x))))
htd |>
ggplot(mapping = aes(x = JUVENILE_CLEARED_COUNT, y = offense_name_binary)) +
geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3) +
geom_function(fun = sigmoid2, color = 'blue', linewidth = 1) +
labs(title = "Modeling a Binary Response with Sigmoid") +
scale_y_continuous(breaks = c(0, 1)) +
theme_minimal()
## Warning: Ignoring unknown parameters: linewidth
The plot illustrates a clear trend: at lower values of
juvenile_cleared_count, the probability of cases being
classified as “involuntary servitude” is low. However, as
juvenile_cleared_count increases, the likelihood of cases
being identified as “commercial sex acts” rises significantly. This
shift indicates that higher counts are strongly associated with cases
falling into the category of “commercial sex acts.” The sigmoid curve
visually captures this transition, offering a concise representation of
how the model’s predictions change as the actual counts vary,
emphasizing the predictive power of juvenile_cleared_count
in distinguishing between the two categories.