library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.2.3
library(ggrepel)
library(broom)
library(lindia)
## Warning: package 'lindia' was built under R version 4.2.3
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(readxl)
htd <- read.csv("htd.csv")
htd <- htd |>
  mutate(offense_name_binary = ifelse(OFFENSE_SUBCAT_NAME == "Commercial Sex Acts", 1,0))
model1 <- glm(offense_name_binary ~ ACTUAL_COUNT, data = htd,
             family = binomial(link = 'logit'))

model1$coefficients
##  (Intercept) ACTUAL_COUNT 
## 0.9568179446 0.0005005197

The intercept of approximately 0.957 represents the estimated log-odds of the binary outcome offense_name_binary being 1 (or true) when ACTUAL_COUNT is 0. In practical terms, this suggests that when there are no instances of ACTUAL_COUNT, the model predicts a likelihood of around 72.6% for offense_name_binary to be 1. The coefficient for ACTUAL_COUNT (0.0005) signifies the change in the log-odds of offense_name_binary being 1 for every one-unit increase in ACTUAL_COUNT. While the effect seems small, it implies that as ACTUAL_COUNT increases by one unit, the log-odds of offense_name_binary being 1 rises by approximately 0.0005. This indicates a subtle but positive association between ACTUAL_COUNT and the likelihood of offense_name_binary being 1, implying a slight increase in the probability of the binary outcome with each additional count in ACTUAL_COUNT.

sigmoid1 <- \(x) 1 / (1 + exp((0.9568179446 + 0.0005005197  * x)))

htd |>
  ggplot(mapping = aes(x = ACTUAL_COUNT, y = offense_name_binary)) +
  geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3) +
  geom_function(fun = sigmoid1, color = 'blue', linewidth = 1) +
  labs(title = "Modeling a Binary Response with Sigmoid") +
  scale_y_continuous(breaks = c(0, 1)) +
  theme_minimal()
## Warning: Ignoring unknown parameters: linewidth

The curve starts at a low probability, increases steeply, and then levels off at a high probability. This means that as the count of human trafficking cases increases, the probability of an offense name being involuntary servitude (0) also increases. However, the probability does not increase linearly, but rather in a sigmoid shape. This suggests that there is a threshold value of the count where the probability changes rapidly from low to high, and then levels off at a maximum value.

TRANSFORMATION OF ACTUAL_COUNT

htd |>
  ggplot(mapping = aes(x = ACTUAL_COUNT)) +
  geom_histogram(color = 'white') +
  labs("Histogram of Actual count cases") +
  theme_hc()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

htd <- htd |>
  mutate(log_actual_count = log(ACTUAL_COUNT + 1))
## Warning in log(ACTUAL_COUNT + 1): NaNs produced
htd |>
  ggplot(mapping = aes(x = JUVENILE_CLEARED_COUNT, y = log_actual_count)) +
  geom_point(shape = 'O', size = 3) +
  geom_smooth(se = FALSE) +
  labs(title = "Log(actual_count) vs. juvenile_cleared_count") +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

htd <- htd |>
  mutate(squared_JUVENILE_CLEARED_COUNT = JUVENILE_CLEARED_COUNT^2)

htd |>
  ggplot(mapping = aes(x = squared_JUVENILE_CLEARED_COUNT, y = log_actual_count)) +
  geom_point(shape = 'O', size = 3) +
  geom_smooth(se = FALSE) +
  labs(title = "Log(actual_count) vs. squared(Juvenile Cleared Count)") +
  theme_minimal()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

model <- glm(squared_JUVENILE_CLEARED_COUNT ~ log_actual_count, data = htd, 
             family = poisson(link = 'log'))

model$coefficients
##      (Intercept) log_actual_count 
##        5.9958317        0.4657834
inv_log <- \(x) exp(5.996 + 0.466 * x)

htd |>
  ggplot(mapping = aes(x = log_actual_count, y = squared_JUVENILE_CLEARED_COUNT)) +
  geom_point() +
  geom_function(fun = inv_log, color = 'blue', linewidth = 1) +
  labs(title = "Modeling Price with the Poisson Model") +
  theme_minimal()
## Warning: Ignoring unknown parameters: linewidth
## Warning: Removed 1 rows containing missing values (geom_point).

The plot, derived from transforming ACTUAL_COUNT, suggests that no further transformation is necessary for the ACTUAL_COUNT explanatory variable. The vertical alignment of data points within a specific range of squared_JUVENILE_CLEARED_COUNT indicates minimal variability in ACTUAL_COUNT values, implying stability in this segment. As squared_JUVENILE_CLEARED_COUNT increases, the plot shows a horizontal extension, suggesting a non-linear relationship where higher values of the juvenile cleared count exponentially influence the actual count. However, despite this non-linearity, the overall shape of the plot does not indicate a need for additional transformations on the ACTUAL_COUNT variable. Therefore, the analysis suggests that the original ACTUAL_COUNT variable is appropriate for modeling.

model2 <- glm(offense_name_binary ~ JUVENILE_CLEARED_COUNT, data = htd,
             family = binomial(link = 'logit'))

model2$coefficients
##            (Intercept) JUVENILE_CLEARED_COUNT 
##            1.063906842           -0.001954318
sigmoid2 <- \(x) 1 / (1 + exp((1.063906842 + (-0.001954318   * x))))
htd |>
  ggplot(mapping = aes(x = JUVENILE_CLEARED_COUNT, y = offense_name_binary)) +
  geom_jitter(width = 0, height = 0.1, shape = 'O', size = 3) +
  geom_function(fun = sigmoid2, color = 'blue', linewidth = 1) +
  labs(title = "Modeling a Binary Response with Sigmoid") +
  scale_y_continuous(breaks = c(0, 1)) +
  theme_minimal()
## Warning: Ignoring unknown parameters: linewidth

The plot illustrates a clear trend: at lower values of juvenile_cleared_count, the probability of cases being classified as “involuntary servitude” is low. However, as juvenile_cleared_count increases, the likelihood of cases being identified as “commercial sex acts” rises significantly. This shift indicates that higher counts are strongly associated with cases falling into the category of “commercial sex acts.” The sigmoid curve visually captures this transition, offering a concise representation of how the model’s predictions change as the actual counts vary, emphasizing the predictive power of juvenile_cleared_count in distinguishing between the two categories.