# Analyze incarceration data (This is from the agent log, there is a separate incarceartion log)
rm(list=ls())


# Load R environment ---------

renv::activate()


# Load packages ---------

library(here)
## here() starts at /users/akhann16/code/cadre/data-analysis-plotting/Simulated-Data-Analysis/r/agent-log-analysis
library(data.table)
library(yaml)
library(ggplot2)


# Read RDS file ------------

#agent_log_env <- readRDS(here("agent-analysis", "rds-outs", "agent_log_env.RDS"))
agent_log_env <- readRDS("/users/akhann16/code/cadre/data-analysis-plotting/Simulated-Data-Analysis/r/agent-log-analysis/rds-outs/agent_log_env.RDS")


# Load data ------------

agent_dt <- agent_log_env[["agent_dt"]]
input_params <- agent_log_env[["input_params"]]


# Summary ------------

last_tick <- max(agent_dt$tick)
selected_ticks <- c(seq(1, last_tick, by = 10), last_tick)

# mean incarceration rate over time
incarceration_summary <- agent_dt[tick %in% selected_ticks, 
                                  .(n_incarcerated = sum(current_incarceration_status == 1),
                                    n_agents = .N), 
                                  by = tick]

incarceration_summary[, rate_per_100k := n_incarcerated / n_agents * 100000]

ggplot(incarceration_summary, aes(x = tick, y = rate_per_100k)) +
  geom_line() +
  theme_minimal() +
  labs(title = "",
       x = "Time (Days)",
       y = "Incarceration Rate (per 100,000 persons)")+
  #theme(text = element_text(size = 20, face = "bold"))+
  theme(text = element_text(face = "bold"))+
  ylim(c(0,500))

# number incarcerated
ggplot(incarceration_summary, aes(x = tick, y = n_incarcerated)) +
  geom_line() +
  theme_minimal() +
  labs(title = "",
       x = "Time (Days)",
       y = "Number Incarcerated")

# Last tick analysis ----------

# number and proportion of incarcerated persons at last tick:
incarceration_summary <- agent_dt[tick %in% selected_ticks, 
                                  .(n_incarcerated = sum(current_incarceration_status == 1),
                                    n_agents = .N), 
                                  by = tick]
tail(incarceration_summary)
##     tick n_incarcerated n_agents
## 1: 10901             18    10001
## 2: 10911             20    10000
## 3: 10921             20    10001
## 4: 10931             18    10003
## 5: 10941             15    10002
## 6: 10950             17    10001
# race distribution
agent_dt[tick==last_tick & current_incarceration_status == 1, 
         .N, 
         by=c("race")][,"prop":=round(N/sum(N)*100, 0)][]
##        race  N prop
## 1:    Black 12   71
## 2:    White  4   24
## 3: Hispanic  1    6
# sex distribution
agent_dt[tick==last_tick & current_incarceration_status == 1, 
         .N, 
         by=c("female")][,"prop":=round(N/sum(N)*100, 0)][]
##    female  N prop
## 1:      0 17  100
# race-sex distribution
agent_dt[tick==last_tick & current_incarceration_status == 1, 
         .N, 
         by=c("race", "female")][,"prop":=round(N/sum(N)*100, 0)][]
##        race female  N prop
## 1:    Black      0 12   71
## 2:    White      0  4   24
## 3: Hispanic      0  1    6
# Ever incarcerated ----------

# how many?

# To answer this question, we can group the last incarceration times by agent ID.
# If last incarceration time is -1, the person has never been incarcerated. 
# If last incarceration time is > -1, the person has been incarcerated at least once. 

last_incarceration_time <- agent_dt[, 
                                    .(last_incarceration_time = max(last_incarceration_tick)), 
                                    by = id]

n_ever_incarcerated <- sum(last_incarceration_time$last_incarceration_time != -1)

prop_ever_incarcerated <- n_ever_incarcerated / nrow(last_incarceration_time)


# agent-level analysis of those ever incarcerated 

ever_incarcerated_times <- last_incarceration_time[last_incarceration_time > 1]
dim(ever_incarcerated_times)
## [1] 573   2
## demographics

### generate dataset
ever_incarcerated_ids <-
  unique(agent_dt[id %in% ever_incarcerated_times$id, id])
ever_incarcerated_info <- agent_dt[id %in% ever_incarcerated_ids,
                                   .SD[.N, list(
                                     id,
                                     age = last(age),
                                     race = last(race),
                                     female = last(female)
                                   )],
                                   by = id]


### create distributions

ever_incarcerated_info[,#age 
                       .(
  mean_age = mean(age),
  min_age = min(age),
  max_age = max(age)
)]
##    mean_age min_age max_age
## 1: 70.26003      28      84
sex_incarcerated_proportion <- 
  ever_incarcerated_info[, .(
    Count = .N,
    Proportion = round(.N / nrow(ever_incarcerated_info), 2)
  ), 
  by = .(Sex = ifelse(female==1, "Female", "Male"))
  ]

race_incarcerated_proportion <- 
  ever_incarcerated_info[, .(
  Count = .N,
  Proportion = round(.N / nrow(ever_incarcerated_info), 2)
  ), 
  by = .(Race=race)]

race_sex_incarcerated_proportion <- 
  ever_incarcerated_info[, .N, by = c("race", "female")][
    #race, sex
    ,
    .(Race = race,
      Sex = ifelse(female == 1, "Female", "Male"),
      Count = N,
      Proportion = round(N / sum(N), 2)
    )[order(c(female, race))]
  ]


# Race/Sex distribution in full population ----------

## defined as  % group representation in incarcerated population
## relative to to representation in general population

unique_agents <- unique(agent_dt, by = "id")
dim(unique_agents)
## [1] 15735    16
str(unique_agents)
## Classes 'data.table' and 'data.frame':   15735 obs. of  16 variables:
##  $ tick                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ id                          : int  2610 7868 0 1 2 3 4 5 6 7 ...
##  $ age                         : int  84 84 83 83 41 31 23 48 77 21 ...
##  $ race                        : chr  "Asian" "White" "Hispanic" "Hispanic" ...
##  $ female                      : int  1 1 1 0 1 0 1 0 1 0 ...
##  $ alc_use_status              : int  0 3 1 3 0 0 0 1 0 0 ...
##  $ smoking_status              : chr  "Never" "Former" "Former" "Former" ...
##  $ last_incarceration_tick     : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ last_release_tick           : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ current_incarceration_status: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ entry_at_tick               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ exit_at_tick                : int  1 1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ n_incarcerations            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ n_releases                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ n_smkg_stat_trans           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ n_alc_use_stat_trans        : int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
race_population_proportion <- 
  unique_agents[, .(
                Count = .N,
                Proportion = round(.N/nrow(unique_agents), 2)
                ),
              by = .(Race = race)
              ]

sex_population_proportion <-
  unique_agents[, .(
  Count = .N,
  Proportion = round(.N/nrow(unique_agents), 2)
  ),
  by = .(Sex = ifelse(female==1, "Female", "Male"))
  ]

race_sex_population_proportion <- 
  unique_agents[, .N, by = c("race", "female")][
    #race, sex
    ,
    .(Race = race,
      Sex = ifelse(female == 1, "Female", "Male"),
      Count = N,
      Proportion = round(N / sum(N), 2)
    )[order(c(female, race))]
  ]



# Disparity analysis ---------

## race-sex combined
disparity_analysis <- merge(
  race_sex_incarcerated_proportion,
  race_sex_population_proportion,
  by = c("Race", "Sex"),
  suffixes = c("_Incarcerated", "_Population")
)


disparity_analysis[, Disparity_Ratio := round(Proportion_Incarcerated / Proportion_Population, 2)]
disparity_analysis <- disparity_analysis[order(-Disparity_Ratio)]
disparity_analysis
##        Race    Sex Count_Incarcerated Proportion_Incarcerated
## 1:    Black   Male                152                    0.27
## 2: Hispanic   Male                115                    0.20
## 3:    White   Male                257                    0.45
## 4:    Black Female                 17                    0.03
## 5:    Asian   Male                  5                    0.01
## 6: Hispanic Female                  6                    0.01
## 7:    White Female                 20                    0.03
## 8:    Asian Female                  1                    0.00
##    Proportion_Population Count_Population Disparity_Ratio
## 1:                  0.04              660            6.75
## 2:                  0.08             1225            2.50
## 3:                  0.35             5466            1.29
## 4:                  0.04              675            0.75
## 5:                  0.02              305            0.50
## 6:                  0.08             1290            0.12
## 7:                  0.37             5795            0.08
## 8:                  0.02              319            0.00
## race only
disparity_analysis_race <- merge(
  race_incarcerated_proportion,
  race_population_proportion,
  by = c("Race"),
  suffixes = c("_Incarcerated", "_Population")
)


disparity_analysis_race[, Disparity_Ratio := round(Proportion_Incarcerated / Proportion_Population, 2)]
disparity_analysis_race[order(-Disparity_Ratio)]
##        Race Count_Incarcerated Proportion_Incarcerated Count_Population
## 1:    Black                169                    0.29             1335
## 2: Hispanic                121                    0.21             2515
## 3:    White                277                    0.48            11261
## 4:    Asian                  6                    0.01              624
##    Proportion_Population Disparity_Ratio
## 1:                  0.08            3.62
## 2:                  0.16            1.31
## 3:                  0.72            0.67
## 4:                  0.04            0.25
## sex only
disparity_analysis_sex <- merge(
  sex_incarcerated_proportion,
  sex_population_proportion,
  by = c("Sex"),
  suffixes = c("_Incarcerated", "_Population")
)


disparity_analysis_sex[, Disparity_Ratio := round(Proportion_Population / Proportion_Incarcerated, 2)]
disparity_analysis_sex[order(-Disparity_Ratio)]
##       Sex Count_Incarcerated Proportion_Incarcerated Count_Population
## 1: Female                 44                    0.08             8079
## 2:   Male                529                    0.92             7656
##    Proportion_Population Disparity_Ratio
## 1:                  0.51            6.38
## 2:                  0.49            0.53
# Visualizing incarceration disparity by sex and race -----------

# sex
sex_data <- data.frame(
  Sex = rep(c("Female", "Male"), each = 2),
  Count = c(43, 8080, 527, 7622),
  Proportion = c(0.08, 0.51, 0.92, 0.49),
  PopulationType = rep(c("Incarcerated", "General"), 2)
)

ggplot(sex_data, aes(x = Sex, y = Proportion, fill = PopulationType)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  scale_fill_manual(values = c("General" = "blue", "Incarcerated" = "red")) +
  scale_y_continuous(breaks = seq(0, 1, 0.25), limits = c(0, 1))+
  labs(title = "",
       x = "Sex",
       y = "Proportion",
       fill = "Population Type") +
  theme_minimal()+
  theme(text=element_text(face = "bold", size = 20))

# race 
race_data <- data.frame(
  Race = rep(c("Black", "Hispanic", "White", "Asian"), each = 2),
  Count = c(188, 1310, 123, 2580, 254, 11214, 5, 598),
  Proportion = c(0.33, 0.08, 0.22, 0.16, 0.45, 0.71, 0.01, 0.04),
  PopulationType = rep(c("Incarcerated", "General"), 4)
)

ggplot(race_data, aes(x = Race, y = Proportion, fill = PopulationType)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  scale_fill_manual(values = c("General" = "blue", "Incarcerated" = "red")) +
  scale_y_continuous(breaks = seq(0, 1, 0.25), limits = c(0, 1))+
  labs(title = "",
       x = "Race",
       y = "Proportion",
       fill = "Population Type") +
  theme_minimal()+
  theme(text=element_text(face="bold", size=20))

# current smoking

race_data <- data.frame(
  Race = rep(c("Black", "Hispanic", "White", "Asian"), each = 2),
  Count = c(188, 1310, 123, 2580, 254, 11214, 5, 598),
  Proportion = c(0.33, 0.08, 0.22, 0.16, 0.45, 0.71, 0.01, 0.04),
  PopulationType = rep(c("Incarcerated", "General"), 4)
)


# Visualize incarceration disparity by smoking  -----------

incarcerated <- 
  agent_dt[tick %in% selected_ticks & current_incarceration_status==1, 
           .N, 
           by=c("smoking_status")][,"prop":=round(N/sum(N), 3)][order(smoking_status)]

incarcerated$group <- "Incarcerated"

general_pop <- 
  agent_dt[tick %in% selected_ticks, 
           .N, 
           by=c("smoking_status")][,"prop":=round(N/sum(N), 3)][order(smoking_status)]
general_pop$group <- "General"

combined_data <- rbind(incarcerated, general_pop)
combined_data$smoking_status <- factor(combined_data$smoking_status, 
                                       levels = c("Current", "Former", "Never"))


ggplot(combined_data, aes(x = smoking_status, y = prop, fill = group)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  scale_fill_manual(values = c("General" = "blue", "Incarcerated" = "red")) +
  labs(title = "",
       x = "Smoking Status",
       y = "Proportion (%)",
       fill = "Population Type")+
  ylim(c(0,1))+
  theme_minimal() +
  #theme(text = element_text(size = 20, face = "bold"))
  theme(text = element_text(face = "bold"))

# Visualize incarceration disparity by alcohol status -----------

incarcerated2 <- 
  agent_dt[tick %in% selected_ticks & current_incarceration_status == 1, 
         .N, 
         by=c("alc_use_status")][,"prop":=round(N/sum(N), 3)][order(alc_use_status)]

incarcerated2$group <- "Incarcerated"

general_pop2 <- 
  agent_dt[tick %in% selected_ticks, 
           .N, 
           by=c("alc_use_status")][,"prop":=round(N/sum(N), 3)][order(alc_use_status)]
general_pop2$group <- "General"

combined_data2 <- rbind(incarcerated2, general_pop2)


ggplot(combined_data2, aes(x = alc_use_status, y = prop, fill = group)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  scale_fill_manual(values = c("General" = "blue", "Incarcerated" = "red")) +
  labs(title = "",
       x = "Alcohol Use",
       y = "Proportion (%)",
       fill = "Population Type")+
  ylim(c(0,1)) +
  theme_minimal() +
  #theme(text = element_text(size = 20, face = "bold"))
  theme()