Texas_Cohorts

Quarto

Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see https://quarto.org.

#Importing Data

library(readxl)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(lubridate)


Attaching package: 'lubridate'

The following objects are masked from 'package:base':

    date, intersect, setdiff, union

library(ggplot2)
library(DT)

# Load Data
Texas_Pop_2016 <- read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2016/On Hand Inmate Data 0716.xlsx")
Texas_Pop_2017 <- read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2017/On Hand Inmate Data 0717.xlsx")
Texas_Pop_2018 <- read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2018/On Hand Inmate Data 0718.xlsx")
Texas_Pop_2019 <- read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2019/On Hand Inmate Data 0719.xlsx")
Texas_Pop_2020 <- read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2020/On Hand Inmate Data 0720.xlsx")
Texas_Pop_2021 <- read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2021/On Hand Inmate Data 0721.xlsx")
Texas_Pop_2022 <- read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2022/On Hand Inmate Data 0722.xlsx")
Texas_Pop_2023 <- read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2023/On Hand Inmate Data 0723.xlsx")
Texas_Pop_2024 <- read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2024/On Hand Inmate Data 0724.xlsx")

# Store datasets in a list
df_list <- list(Texas_Pop_2016, Texas_Pop_2017, Texas_Pop_2018, Texas_Pop_2019, Texas_Pop_2020, Texas_Pop_2021, Texas_Pop_2022, Texas_Pop_2023, Texas_Pop_2024)

# Assign correct cohort years
years <- 2016:2024
names(df_list) <- years

# Add snapshot year column to each dataset
for (i in seq_along(df_list)) {
  df_list[[i]]$Year <- years[i]
}

#Cleaning Date

# Function to standardize date columns before merging
# Define correct column names (update if necessary)
library(dplyr)
colnames(Texas_Pop_2016) <- tolower(colnames(Texas_Pop_2016))  # Convert to lowercase
colnames(Texas_Pop_2016) <- gsub("\\s+", "_", colnames(Texas_Pop_2016))

# Standardize column names manually
colnames(Texas_Pop_2017) <- tolower(colnames(Texas_Pop_2017))  # Convert to lowercase
colnames(Texas_Pop_2017) <- gsub("\\s+", "_", colnames(Texas_Pop_2017))

# Standardize column names manually
colnames(Texas_Pop_2018) <- tolower(colnames(Texas_Pop_2018))  # Convert to lowercase
colnames(Texas_Pop_2018) <- gsub("\\s+", "_", colnames(Texas_Pop_2018))

# Standardize column names manually
colnames(Texas_Pop_2019) <- tolower(colnames(Texas_Pop_2019))  # Convert to lowercase
colnames(Texas_Pop_2019) <- gsub("\\s+", "_", colnames(Texas_Pop_2019))

# Standardize column names manually
colnames(Texas_Pop_2020) <- tolower(colnames(Texas_Pop_2020))  # Convert to lowercase
colnames(Texas_Pop_2020) <- gsub("\\s+", "_", colnames(Texas_Pop_2020))

# Standardize column names manually
colnames(Texas_Pop_2021) <- tolower(colnames(Texas_Pop_2021))  # Convert to lowercase
colnames(Texas_Pop_2021) <- gsub("\\s+", "_", colnames(Texas_Pop_2021))

# Standardize column names manually
colnames(Texas_Pop_2022) <- tolower(colnames(Texas_Pop_2022))  # Convert to lowercase
colnames(Texas_Pop_2022) <- gsub("\\s+", "_", colnames(Texas_Pop_2022))

# Standardize column names manually
colnames(Texas_Pop_2023) <- tolower(colnames(Texas_Pop_2023))  # Convert to lowercase
colnames(Texas_Pop_2023) <- gsub("\\s+", "_", colnames(Texas_Pop_2023))

# Standardize column names manually
colnames(Texas_Pop_2024) <- tolower(colnames(Texas_Pop_2024))  # Convert to lowercase
colnames(Texas_Pop_2024) <- gsub("\\s+", "_", colnames(Texas_Pop_2024))

library(dplyr)
# Function to convert date columns to Date format
convert_dates <- function(df) {
  df %>%
    mutate(
      sentence_date = if ("sentence_date" %in% colnames(df)) as.Date(sentence_date, format = "%Y-%m-%d") else NA,
      projected_release = if ("projected_release" %in% colnames(df)) as.Date(projected_release, format = "%Y-%m-%d") else NA,
      maximum_sentence_date = if ("maximum_sentence_date" %in% colnames(df)) as.Date(maximum_sentence_date, format = "%Y-%m-%d") else NA,
      parole_eligibility_date = if ("parole_eligibility_date" %in% colnames(df)) as.Date(parole_eligibility_date, format = "%Y-%m-%d") else NA,
      next_parole_review_date = if ("next_parole_review_date" %in% colnames(df)) as.Date(next_parole_review_date, format = "%Y-%m-%d") else NA
    )
}

# List of datasets
df_list <- list(Texas_Pop_2016, Texas_Pop_2017, Texas_Pop_2018, Texas_Pop_2019, Texas_Pop_2020, Texas_Pop_2021, Texas_Pop_2022, Texas_Pop_2023, Texas_Pop_2024)

# Function to rename or remove duplicate columns
fix_duplicate_columns <- function(df) {
  colnames(df) <- make.unique(colnames(df))  # Make duplicate names unique
  return(df)
}

# Apply function to all datasets
df_list <- lapply(df_list, fix_duplicate_columns)

# Apply function to all datasets
df_list <- lapply(df_list, convert_dates)

# Reassign cleaned datasets back to original variables
Texas_Pop_2016 <- df_list[[1]]
Texas_Pop_2017 <- df_list[[2]]
Texas_Pop_2018 <- df_list[[3]]
Texas_Pop_2019 <- df_list[[4]]
Texas_Pop_2020 <- df_list[[5]]
Texas_Pop_2021 <- df_list[[6]]
Texas_Pop_2022 <- df_list[[7]]
Texas_Pop_2023 <- df_list[[8]]
Texas_Pop_2024 <- df_list[[9]]

# List of datasets with corresponding years
df_list <- list(
  "2016" = Texas_Pop_2016,
  "2017" = Texas_Pop_2017,
  "2018" = Texas_Pop_2018,
  "2019" = Texas_Pop_2019,
  "2020" = Texas_Pop_2020,
  "2021" = Texas_Pop_2021,
  "2022" = Texas_Pop_2022,
  "2023" = Texas_Pop_2023,
  "2024" = Texas_Pop_2024
)

# Loop through each dataset and add `cohort_year`
df_list <- lapply(names(df_list), function(year) {
  df <- df_list[[year]]  # Get the dataframe
  df$cohort_year <- as.numeric(year)  # Assign the correct cohort year
  return(df)
})

# Reassign cleaned datasets back to original variables
Texas_Pop_2016 <- df_list[[1]]
Texas_Pop_2017 <- df_list[[2]]
Texas_Pop_2018 <- df_list[[3]]
Texas_Pop_2019 <- df_list[[4]]
Texas_Pop_2020 <- df_list[[5]]
Texas_Pop_2021 <- df_list[[6]]
Texas_Pop_2022 <- df_list[[7]]
Texas_Pop_2023 <- df_list[[8]]
Texas_Pop_2024 <- df_list[[9]]

library(dplyr)
library(lubridate)  # Needed for year() function

# Function to extract sentenced_year and calculate incarceration stats
calculate_incarceration_stats <- function(df) {
  df %>%
    mutate(
      sentenced_year = year(sentence_date),  # Extract the year from sentence_date
      birth_year = cohort_year - age,  # Calculate birth year
      estimated_age_at_admission = sentenced_year - birth_year,  # Age at admission
      LOS_Years = cohort_year - sentenced_year  # Length of stay in years
    )
}

# List of datasets
df_list <- list(Texas_Pop_2016, Texas_Pop_2017, Texas_Pop_2018, Texas_Pop_2019, 
                Texas_Pop_2020, Texas_Pop_2021, Texas_Pop_2022, Texas_Pop_2023, Texas_Pop_2024)

# Apply function to all datasets
df_list <- lapply(df_list, calculate_incarceration_stats)

# Reassign modified datasets back to original variables
Texas_Pop_2016 <- df_list[[1]]
Texas_Pop_2017 <- df_list[[2]]
Texas_Pop_2018 <- df_list[[3]]
Texas_Pop_2019 <- df_list[[4]]
Texas_Pop_2020 <- df_list[[5]]
Texas_Pop_2021 <- df_list[[6]]
Texas_Pop_2022 <- df_list[[7]]
Texas_Pop_2023 <- df_list[[8]]
Texas_Pop_2024 <- df_list[[9]]

library(dplyr)

# Function to filter LPT (15+ years) and categorize age groups
process_lpt_data <- function(df) {
  df %>%
    filter(LOS_Years >= 15) %>%  # Filter only individuals serving 15+ years
    mutate(
      # Categorizing Age at Admission
      Admin_Age_Group = case_when(
        estimated_age_at_admission < 18 ~ "Youth",
        estimated_age_at_admission >= 18 & estimated_age_at_admission < 25 ~ "Emerging Adult",
        TRUE ~ "Other"
      ),
      
      # Categorizing Current Age
      Current_Age_Group = case_when(
        age >= 55 ~ "Aging Population",
        TRUE ~ "Other"
      )
    )
}
# List of datasets
df_list <- list(Texas_Pop_2016, Texas_Pop_2017, Texas_Pop_2018, Texas_Pop_2019, 
                Texas_Pop_2020, Texas_Pop_2021, Texas_Pop_2022, Texas_Pop_2023, Texas_Pop_2024)

# Apply function to all datasets
df_list <- lapply(df_list, process_lpt_data)

# Reassign filtered datasets back to original variables
Texas_Pop_2016 <- df_list[[1]]
Texas_Pop_2017 <- df_list[[2]]
Texas_Pop_2018 <- df_list[[3]]
Texas_Pop_2019 <- df_list[[4]]
Texas_Pop_2020 <- df_list[[5]]
Texas_Pop_2021 <- df_list[[6]]
Texas_Pop_2022 <- df_list[[7]]
Texas_Pop_2023 <- df_list[[8]]
Texas_Pop_2024 <- df_list[[9]]

library(dplyr)
library(lubridate)

# Define the cutoff date (July 1, 2024)
cutoff_date <- as.Date("2024-07-01")

# Combine all datasets into a single dataframe
texas_lpt_combined <- bind_rows(df_list, .id = "source_df")

# Ensure maximum_sentence_date is in Date format
texas_lpt_combined <- texas_lpt_combined %>%
  mutate(maximum_sentence_date = as.Date(maximum_sentence_date, format = "%Y-%m-%d"))

# Identify individuals who were serving an LPT at any time
lpt_individuals <- texas_lpt_combined %>%
  filter(LOS_Years >= 15) %>%
  select(tdcj_number, maximum_sentence_date) %>%
  distinct()

# Identify individuals whose max sentence date extends beyond July 1, 2024
still_incarcerated <- lpt_individuals %>%
  filter(maximum_sentence_date > cutoff_date) %>%
  select(tdcj_number)

# Identify individuals who are missing from the 2024 dataset and have a max sentence date before or on July 1, 2024
individuals_missing_in_2024 <- setdiff(lpt_individuals$tdcj_number, 
                                       texas_lpt_combined$tdcj_number[texas_lpt_combined$cohort_year == 2024])

released_or_deceased <- lpt_individuals %>%
  filter(tdcj_number %in% individuals_missing_in_2024 & maximum_sentence_date <= cutoff_date) %>%
  select(tdcj_number)

# Merge back to the combined dataset
texas_lpt_combined <- texas_lpt_combined %>%
  mutate(
    still_incarcerated = ifelse(tdcj_number %in% still_incarcerated$tdcj_number, TRUE, FALSE),
    released_or_deceased = ifelse(tdcj_number %in% released_or_deceased$tdcj_number, TRUE, FALSE)
  )

# Check summary
table(texas_lpt_combined$still_incarcerated)


 FALSE   TRUE 
 11418 147819

table(texas_lpt_combined$released_or_deceased)


 FALSE   TRUE 
147454  11783

library(dplyr)
library(ggplot2)
library(lubridate)

# Combine all datasets into a single dataframe
texas_lpt_combined <- bind_rows(df_list, .id = "source_df")

# Convert maximum sentence date to Date format
texas_lpt_combined <- texas_lpt_combined %>%
  mutate(maximum_sentence_date = as.Date(maximum_sentence_date, format = "%Y-%m-%d"))

# Identify individuals with maximum sentence date beyond July 1, 2024
texas_lpt_combined <- texas_lpt_combined %>%
  mutate(still_incarcerated_after_2024 = ifelse(maximum_sentence_date > as.Date("2024-07-01"), TRUE, FALSE))

# Summary of still incarcerated individuals
table(texas_lpt_combined$still_incarcerated_after_2024)


 FALSE   TRUE 
 11680 147432

# Count number of LPT individuals by cohort year
lpt_growth <- texas_lpt_combined %>%
  group_by(cohort_year) %>%
  summarize(total_LPT = n())

# Plot the trend of LPT cases over time
ggplot(lpt_growth, aes(x = cohort_year, y = total_LPT)) +
  geom_line() +
  geom_point() +
  labs(title = "Growth of Long Prison Terms Over Time",
       x = "Cohort Year",
       y = "Number of LPT Cases") +
  theme_minimal()

# Race distribution among LPTs
lpt_race_dist <- texas_lpt_combined %>%
  filter(still_incarcerated_after_2024) %>%
  group_by(race) %>%
  summarize(count = n())

# Gender distribution among LPTs
lpt_gender_dist <- texas_lpt_combined %>%
  filter(still_incarcerated_after_2024) %>%
  group_by(gender) %>%
  summarize(count = n())

# Plot race distribution
ggplot(lpt_race_dist, aes(x = reorder(race, -count), y = count, fill = race)) +
  geom_bar(stat = "identity") +
  labs(title = "Race Distribution of LPT Cases",
       x = "Race",
       y = "Count") +
  theme_minimal() +
  theme(legend.position = "none")

# Plot gender distribution
ggplot(lpt_gender_dist, aes(x = gender, y = count, fill = gender)) +
  geom_bar(stat = "identity") +
  labs(title = "Gender Distribution of LPT Cases",
       x = "Gender",
       y = "Count") +
  theme_minimal() +
  theme(legend.position = "none")

library(dplyr)
library(ggplot2)

# Aggregate count of LPT individuals by cohort year and age group at admission
lpt_by_age_at_admission <- texas_lpt_combined %>%
  filter(!is.na(Admin_Age_Group)) %>%  # Remove missing values if any
  group_by(cohort_year, Admin_Age_Group) %>%
  summarize(count = n(), .groups = "drop")

# Plot LPT growth by age at admission over time
ggplot(lpt_by_age_at_admission, aes(x = cohort_year, y = count, color = Admin_Age_Group, group = Admin_Age_Group)) +
  geom_line(size = 1) +
  geom_point(size = 2) +  # Add points for better readability
  scale_x_continuous(breaks = seq(min(lpt_by_age_at_admission$cohort_year, na.rm = TRUE), 
                                  max(lpt_by_age_at_admission$cohort_year, na.rm = TRUE), 1)) +  # Ensure yearly intervals
  labs(title = "LPT Growth by Age at Admission (2016-2024)",
       x = "Cohort Year", 
       y = "Number of People",
       color = "Age at Admission") +
  theme_minimal() +
  theme(legend.position = "right")  # Adjust legend position for readability

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

library(dplyr)
library(ggplot2)

# Aggregate count of LPT individuals by cohort year and current age group
lpt_by_current_age <- texas_lpt_combined %>%
  filter(!is.na(Current_Age_Group)) %>%  # Remove missing values if any
  group_by(cohort_year, Current_Age_Group) %>%
  summarize(count = n(), .groups = "drop")

# Plot LPT growth by current age over time
ggplot(lpt_by_current_age, aes(x = cohort_year, y = count, color = Current_Age_Group, group = Current_Age_Group)) +
  geom_line(size = 1) +
  geom_point(size = 2) +  # Add points for better readability
  scale_x_continuous(breaks = seq(min(lpt_by_current_age$cohort_year, na.rm = TRUE), 
                                  max(lpt_by_current_age$cohort_year, na.rm = TRUE), 1)) +  # Ensure yearly intervals
  labs(title = "LPT Growth by Current Age (2016-2024)",
       x = "Cohort Year", 
       y = "Number of People",
       color = "Current Age Group") +
  theme_minimal() +
  theme(legend.position = "right")  # Adjust legend position for better visualization