# Analyze incarceration data


# Load R environment ---------

renv::activate()


# Load packages ---------

library(here)
## here() starts at /users/akhann16/code/cadre/data-analysis-plotting/Simulated-Data-Analysis/r/agent-analysis
library(data.table)
library(yaml)
library(ggplot2)


# Utility functions ------------
#source(here("agent-analysis", "utils", "post-release-smoking.R"))
source("/users/akhann16/code/cadre/data-analysis-plotting/Simulated-Data-Analysis/r/agent-analysis/utils/post-release-smoking.R")


# Read RDS file ------------

#agent_log_env <- readRDS(here("agent-analysis", "rds-outs", "agent_log_env.RDS"))
agent_log_env <- readRDS("/users/akhann16/code/cadre/data-analysis-plotting/Simulated-Data-Analysis/r/agent-analysis/rds-outs/agent_log_env.RDS")


# Load data ------------

agent_dt <- agent_log_env[["agent_dt"]]
input_params <- agent_log_env[["input_params"]]


# Tick variables ------------

last_tick <- max(agent_dt$tick)
selected_ticks <- c(seq(1, last_tick, by = 10), last_tick)
last_tick_data <- agent_dt[tick==last_tick,]


# Smoking ratio at last tick of current smoking to all persons ----------

smoking_ratio <- 
  last_tick_data[, .(
    n_current_smokers = sum(smoking_status == "Current"),
    n_total_persons = sum(smoking_status %in% c("Current", "Former", "Never"))
  ), by = .(group = n_releases >= 1)]

# Compute the ratio of current smokers to total smokers in each group

smoking_ratio
##    group n_current_smokers n_total_persons
## 1: FALSE              1252            9604
## 2:  TRUE               133             396
smoking_ratio[, ratio := n_current_smokers / n_total_persons][]
##    group n_current_smokers n_total_persons     ratio
## 1: FALSE              1252            9604 0.1303623
## 2:  TRUE               133             396 0.3358586
# Compute change in current smoking over time since release ----------

## computation
time_periods <- c(1, 7, 14, 30, 90, 180, 365)
labels <- c("1D", "1W", "2W", "1M", "3M", "6M", "1Y")
summaries_smoking <- lapply(time_periods, function(x) summary_after_release(x))

summaries_smoking
## [[1]]
## [[1]]$proportions
##   Category Proportion
## 1  Current  0.7310345
## 2   Former  0.1103448
## 3    Never  0.1586207
## 
## [[1]]$agent_count
## [1] 145
## 
## 
## [[2]]
## [[2]]$proportions
##   Category Proportion
## 1  Current 0.80916031
## 2   Former 0.06965649
## 3    Never 0.12118321
## 
## [[2]]$agent_count
## [1] 1048
## 
## 
## [[3]]
## [[3]]$proportions
##   Category Proportion
## 1  Current 0.80037665
## 2   Former 0.07532957
## 3    Never 0.12429379
## 
## [[3]]$agent_count
## [1] 2124
## 
## 
## [[4]]
## [[4]]$proportions
##   Category Proportion
## 1  Current  0.7882743
## 2   Former  0.0869469
## 3    Never  0.1247788
## 
## [[4]]$agent_count
## [1] 4520
## 
## 
## [[5]]
## [[5]]$proportions
##   Category Proportion
## 1  Current  0.7375224
## 2   Former  0.1306548
## 3    Never  0.1318228
## 
## [[5]]$agent_count
## [1] 12843
## 
## 
## [[6]]
## [[6]]$proportions
##   Category Proportion
## 1  Current  0.6717852
## 2   Former  0.1871198
## 3    Never  0.1410950
## 
## [[6]]$agent_count
## [1] 23835
## 
## 
## [[7]]
## [[7]]$proportions
##   Category Proportion
## 1  Current  0.5891743
## 2   Former  0.2515474
## 3    Never  0.1592783
## 
## [[7]]$agent_count
## [1] 41845
# Visualization (smoking in released persons) ----------

current_smoker_proportions <- sapply(
  summaries_smoking, 
  function(x) {
    if ("Current" %in% x$proportions$Category){
      return(x$proportions[x$proportions$Category == "Current", "Proportion"])
    }
    else {
      return(0)
    }
  }
)

current_smoker_df <- data.frame(Time = time_periods,
                                Labels = labels,
                                Proportion = current_smoker_proportions)

p <- 
  ggplot(current_smoker_df, aes(x = Time, y = Proportion, group = 1)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = time_periods, labels = labels) +
  scale_y_continuous(breaks = seq(0, 1, 0.2), limits = c(0, 1)) +
  labs(title = "",
       x = "Time After Release",
       y = "Proportion of Current Smoking") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))


# Visualization (smoking in general population) ----------

# calculate the proportion of current, former, and never smokers at the specified time ticks
smokers_by_tick <- agent_dt[tick %in% selected_ticks, .(
  current_smokers = sum(smoking_status == "Current") / .N,
  former_smokers = sum(smoking_status == "Former") / .N,
  never_smokers = sum(smoking_status == "Never") / .N
), by = tick]

# mean from the the last last 10 rows
mean_current_smoker <- mean(tail(smokers_by_tick$current_smokers, 10))

# create a new data frame for the general population
general_population <- data.frame(
  Time = current_smoker_df$Time,
  Proportion = rep(mean_current_smoker, length(current_smoker_df$Time))
)

# create the plot
p <- ggplot() +
  geom_line(data = current_smoker_df, aes(x = Time, y = Proportion, color = "Previously Incarcerated"), linewidth=1.5) +
  geom_line(data = general_population, aes(x = Time, y = Proportion, color = "Overall"), linewidth=1.5) +
  scale_color_manual(values = c("Previously Incarcerated" = "blue", "Overall" = "red")) +
  scale_x_continuous(breaks = time_periods, labels = labels) +
  scale_y_continuous(breaks = seq(0, 1, 0.2), limits = c(0, 1)) +
  labs(
    title = "",
    x = "Time After Release",
    y = "",
    color = "Current Smoking Rate"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        text = element_text(size = 24, face = "bold"))
p