# Analyze simulated demographic data
rm(list=ls())
# Load R environment ---------
renv::activate()
# Load packages ---------
library(here)
## here() starts at /users/akhann16/code/cadre/data-analysis-plotting/Simulated-Data-Analysis/r/agent-log-analysis
library(data.table)
library(yaml)
library(ggplot2)
# Read RDS file ------------
agent_log_env <- readRDS("/users/akhann16/code/cadre/data-analysis-plotting/Simulated-Data-Analysis/r/agent-log-analysis/rds-outs/agent_log_env.RDS")
# Load data ------------
agent_dt <- agent_log_env[["agent_dt"]]
input_params <- agent_log_env[["input_params"]] # THE INPUT PARAMS NEED TO BE ADDED FOR THE CODE TO WORK
head(agent_dt)
## tick id age race female alc_use_status smoking_status
## 1: 1 2610 84 Asian 1 0 Never
## 2: 1 7868 84 White 1 3 Former
## 3: 1 0 83 Hispanic 1 1 Former
## 4: 1 1 83 Hispanic 0 3 Former
## 5: 1 2 41 White 1 0 Never
## 6: 1 3 31 Hispanic 0 0 Never
## last_incarceration_tick last_release_tick current_incarceration_status
## 1: -1 -1 0
## 2: -1 -1 0
## 3: -1 -1 0
## 4: -1 -1 0
## 5: -1 -1 0
## 6: -1 -1 0
## entry_at_tick exit_at_tick n_incarcerations n_releases n_smkg_stat_trans
## 1: 0 1 0 0 0
## 2: 0 1 0 0 0
## 3: 0 -1 0 0 0
## 4: 0 -1 0 0 0
## 5: 0 -1 0 0 0
## 6: 0 -1 0 0 0
## n_alc_use_stat_trans
## 1: 0
## 2: 0
## 3: 0
## 4: 0
## 5: 0
## 6: 0
str(agent_dt)
## Classes 'data.table' and 'data.frame': 10965735 obs. of 16 variables:
## $ tick : int 1 1 1 1 1 1 1 1 1 1 ...
## $ id : int 2610 7868 0 1 2 3 4 5 6 7 ...
## $ age : int 84 84 83 83 41 31 23 48 77 21 ...
## $ race : chr "Asian" "White" "Hispanic" "Hispanic" ...
## $ female : int 1 1 1 0 1 0 1 0 1 0 ...
## $ alc_use_status : int 0 3 1 3 0 0 0 1 0 0 ...
## $ smoking_status : chr "Never" "Former" "Former" "Former" ...
## $ last_incarceration_tick : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ last_release_tick : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ current_incarceration_status: int 0 0 0 0 0 0 0 0 0 0 ...
## $ entry_at_tick : int 0 0 0 0 0 0 0 0 0 0 ...
## $ exit_at_tick : int 1 1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ n_incarcerations : int 0 0 0 0 0 0 0 0 0 0 ...
## $ n_releases : int 0 0 0 0 0 0 0 0 0 0 ...
## $ n_smkg_stat_trans : int 0 0 0 0 0 0 0 0 0 0 ...
## $ n_alc_use_stat_trans : int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Last tick --------
last_tick <- max(agent_dt$tick)
print(last_tick)
## [1] 10950
# Compute metrics and plot --------
alc_use_last_tick <-
agent_dt[tick == last_tick, .N, by = c("alc_use_status")][,
"%" := round(N /sum(N), 3)][
order(alc_use_status)][
]
# targets
alc_use_props <- input_params$ALC_USE_PROPS
alc_use_targets <- c(alc_use_props$`0`, alc_use_props$`1`, alc_use_props$`2`, alc_use_props$`3`)
cbind(alc_use_last_tick[["%"]], alc_use_targets)
## alc_use_targets
## [1,] 0.353 0.35
## [2,] 0.553 0.55
## [3,] 0.047 0.05
## [4,] 0.047 0.05
# Calculate the percentages for alcohol use status
alc_use_dt <- agent_dt[tick == last_tick, .N, by = alc_use_status][, `:=` (percentage = round(N/sum(N)*100, 2))]
# Order the data table by alcohol use status
setorder(alc_use_dt, alc_use_status)
# Rename the alcohol use status codes to descriptive labels
alc_use_dt[, alc_use_status := factor(alc_use_status, levels = c(0, 1, 2, 3),
labels = c("Non-Drinking",
"Category I",
"Category II",
"Category III"))]
ggplot(alc_use_dt, aes(x = alc_use_status, y = percentage, fill = alc_use_status)) +
geom_bar(stat = "identity", color = "black") +
labs(title = "Agent Alcohol Use Status Distribution",
x = "Use Frequency",
y = "Percentage",
fill = "Alcohol Use Status") +
theme_minimal() +
guides(fill = "none") +
geom_text(aes(label = paste0(as.numeric(input_params$ALC_USE_PROPS)*100, "%")),
position = position_stack(vjust = 1.02))+
scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, 10))

# Plot over time
# calculate the distribution of alcohol states at the specified time ticks
time_ticks <- c(seq(1, last_tick, by = 10), last_tick)
alc_use_by_tick <-
agent_dt[tick %in% time_ticks, .(
"Non Drinking" = sum(alc_use_status == 0) / .N,
"Category I" = sum(alc_use_status == 1) / .N,
"Category II" = sum(alc_use_status == 2) / .N,
"Category III" = sum(alc_use_status== 3) / .N
),
by = tick]
# convert the data from wide to long format
alc_use_status_long <- melt(alc_use_by_tick,
id.vars = "tick",
variable.name = "alc_use_status",
value.name = "proportion")
# create a time series plot of race proportions
alc_use_time_plot <-
ggplot(alc_use_status_long,
aes(x = tick, y = proportion, color = alc_use_status, group = alc_use_status)) +
geom_line(linewidth=1.2) +
labs(title = "",
x = "Time (Days)",
y = "Proportion") +
scale_color_manual(values = c("#377eb8", "#ff7f00", "#4daf4a", "#e41a1c")) +
theme_minimal() +
theme(
legend.title = element_blank(),
axis.text.x = element_text(size = 14, face = "bold"),
axis.text.y = element_text(size = 14, face = "bold"),
legend.text = element_text(size = 14),
axis.title.x = element_text(size = 16, face = "bold"),
axis.title.y = element_text(size = 16, face = "bold")
) +
theme(legend.title = element_blank())+
scale_y_continuous(limits = c(0, 0.8), breaks = seq(0, 0.8, by = 0.1)) +
geom_text(aes(x = max(tick)/2, y = alc_use_targets[1] + 0.03,
label = sprintf("Target: %.2f", alc_use_targets[1])), color = "#377eb8", check_overlap = TRUE, size=5) +
geom_text(aes(x = max(tick)/2, y = alc_use_targets[2] + 0.03,
label = sprintf("Target: %.2f", alc_use_targets[2])), color = "#ff7f00", check_overlap = TRUE, size=5) +
geom_text(aes(x = max(tick)/2, y = alc_use_targets[3] + 0.03,
label = sprintf("Target: %.2f", alc_use_targets[3])), color = "#4daf4a", check_overlap = TRUE, size=5) +
geom_text(aes(x = max(tick)/2, y = alc_use_targets[4] - 0.03,
label = sprintf("Target: %.2f", alc_use_targets[4])), color = "#e41a1c", check_overlap = TRUE, size=5)
alc_use_time_plot
