Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see https://quarto.org.
#Importing Data
library(readxl)library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(lubridate)
Attaching package: 'lubridate'
The following objects are masked from 'package:base':
date, intersect, setdiff, union
library(ggplot2)library(DT)# Load DataTexas_Pop_2016 <-read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2016/On Hand Inmate Data 0716.xlsx")Texas_Pop_2017 <-read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2017/On Hand Inmate Data 0717.xlsx")Texas_Pop_2018 <-read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2018/On Hand Inmate Data 0718.xlsx")Texas_Pop_2019 <-read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2019/On Hand Inmate Data 0719.xlsx")Texas_Pop_2020 <-read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2020/On Hand Inmate Data 0720.xlsx")Texas_Pop_2021 <-read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2021/On Hand Inmate Data 0721.xlsx")Texas_Pop_2022 <-read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2022/On Hand Inmate Data 0722.xlsx")Texas_Pop_2023 <-read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2023/On Hand Inmate Data 0723.xlsx")Texas_Pop_2024 <-read_excel("~/Library/CloudStorage/Box-Box/Company/Working files/LPT & Aging in Prison/Texas/Texas.Data.PIA/CY2024/On Hand Inmate Data 0724.xlsx")# Store datasets in a listdf_list <-list(Texas_Pop_2016, Texas_Pop_2017, Texas_Pop_2018, Texas_Pop_2019, Texas_Pop_2020, Texas_Pop_2021, Texas_Pop_2022, Texas_Pop_2023, Texas_Pop_2024)# Assign correct cohort yearsyears <-2016:2024names(df_list) <- years# Add snapshot year column to each datasetfor (i inseq_along(df_list)) { df_list[[i]]$Year <- years[i]}
#Cleaning Date
# Function to standardize date columns before merging# Define correct column names (update if necessary)library(dplyr)colnames(Texas_Pop_2016) <-tolower(colnames(Texas_Pop_2016)) # Convert to lowercasecolnames(Texas_Pop_2016) <-gsub("\\s+", "_", colnames(Texas_Pop_2016))# Standardize column names manuallycolnames(Texas_Pop_2017) <-tolower(colnames(Texas_Pop_2017)) # Convert to lowercasecolnames(Texas_Pop_2017) <-gsub("\\s+", "_", colnames(Texas_Pop_2017))# Standardize column names manuallycolnames(Texas_Pop_2018) <-tolower(colnames(Texas_Pop_2018)) # Convert to lowercasecolnames(Texas_Pop_2018) <-gsub("\\s+", "_", colnames(Texas_Pop_2018))# Standardize column names manuallycolnames(Texas_Pop_2019) <-tolower(colnames(Texas_Pop_2019)) # Convert to lowercasecolnames(Texas_Pop_2019) <-gsub("\\s+", "_", colnames(Texas_Pop_2019))# Standardize column names manuallycolnames(Texas_Pop_2020) <-tolower(colnames(Texas_Pop_2020)) # Convert to lowercasecolnames(Texas_Pop_2020) <-gsub("\\s+", "_", colnames(Texas_Pop_2020))# Standardize column names manuallycolnames(Texas_Pop_2021) <-tolower(colnames(Texas_Pop_2021)) # Convert to lowercasecolnames(Texas_Pop_2021) <-gsub("\\s+", "_", colnames(Texas_Pop_2021))# Standardize column names manuallycolnames(Texas_Pop_2022) <-tolower(colnames(Texas_Pop_2022)) # Convert to lowercasecolnames(Texas_Pop_2022) <-gsub("\\s+", "_", colnames(Texas_Pop_2022))# Standardize column names manuallycolnames(Texas_Pop_2023) <-tolower(colnames(Texas_Pop_2023)) # Convert to lowercasecolnames(Texas_Pop_2023) <-gsub("\\s+", "_", colnames(Texas_Pop_2023))# Standardize column names manuallycolnames(Texas_Pop_2024) <-tolower(colnames(Texas_Pop_2024)) # Convert to lowercasecolnames(Texas_Pop_2024) <-gsub("\\s+", "_", colnames(Texas_Pop_2024))
library(dplyr)# Function to convert date columns to Date formatconvert_dates <-function(df) { df %>%mutate(sentence_date =if ("sentence_date"%in%colnames(df)) as.Date(sentence_date, format ="%Y-%m-%d") elseNA,projected_release =if ("projected_release"%in%colnames(df)) as.Date(projected_release, format ="%Y-%m-%d") elseNA,maximum_sentence_date =if ("maximum_sentence_date"%in%colnames(df)) as.Date(maximum_sentence_date, format ="%Y-%m-%d") elseNA,parole_eligibility_date =if ("parole_eligibility_date"%in%colnames(df)) as.Date(parole_eligibility_date, format ="%Y-%m-%d") elseNA,next_parole_review_date =if ("next_parole_review_date"%in%colnames(df)) as.Date(next_parole_review_date, format ="%Y-%m-%d") elseNA )}
# List of datasetsdf_list <-list(Texas_Pop_2016, Texas_Pop_2017, Texas_Pop_2018, Texas_Pop_2019, Texas_Pop_2020, Texas_Pop_2021, Texas_Pop_2022, Texas_Pop_2023, Texas_Pop_2024)
# Function to rename or remove duplicate columnsfix_duplicate_columns <-function(df) {colnames(df) <-make.unique(colnames(df)) # Make duplicate names uniquereturn(df)}# Apply function to all datasetsdf_list <-lapply(df_list, fix_duplicate_columns)
# Apply function to all datasetsdf_list <-lapply(df_list, convert_dates)# Reassign cleaned datasets back to original variablesTexas_Pop_2016 <- df_list[[1]]Texas_Pop_2017 <- df_list[[2]]Texas_Pop_2018 <- df_list[[3]]Texas_Pop_2019 <- df_list[[4]]Texas_Pop_2020 <- df_list[[5]]Texas_Pop_2021 <- df_list[[6]]Texas_Pop_2022 <- df_list[[7]]Texas_Pop_2023 <- df_list[[8]]Texas_Pop_2024 <- df_list[[9]]
# List of datasets with corresponding yearsdf_list <-list("2016"= Texas_Pop_2016,"2017"= Texas_Pop_2017,"2018"= Texas_Pop_2018,"2019"= Texas_Pop_2019,"2020"= Texas_Pop_2020,"2021"= Texas_Pop_2021,"2022"= Texas_Pop_2022,"2023"= Texas_Pop_2023,"2024"= Texas_Pop_2024)# Loop through each dataset and add `cohort_year`df_list <-lapply(names(df_list), function(year) { df <- df_list[[year]] # Get the dataframe df$cohort_year <-as.numeric(year) # Assign the correct cohort yearreturn(df)})# Reassign cleaned datasets back to original variablesTexas_Pop_2016 <- df_list[[1]]Texas_Pop_2017 <- df_list[[2]]Texas_Pop_2018 <- df_list[[3]]Texas_Pop_2019 <- df_list[[4]]Texas_Pop_2020 <- df_list[[5]]Texas_Pop_2021 <- df_list[[6]]Texas_Pop_2022 <- df_list[[7]]Texas_Pop_2023 <- df_list[[8]]Texas_Pop_2024 <- df_list[[9]]
library(dplyr)library(lubridate) # Needed for year() function# Function to extract sentenced_year and calculate incarceration statscalculate_incarceration_stats <-function(df) { df %>%mutate(sentenced_year =year(sentence_date), # Extract the year from sentence_datebirth_year = cohort_year - age, # Calculate birth yearestimated_age_at_admission = sentenced_year - birth_year, # Age at admissionLOS_Years = cohort_year - sentenced_year # Length of stay in years )}# List of datasetsdf_list <-list(Texas_Pop_2016, Texas_Pop_2017, Texas_Pop_2018, Texas_Pop_2019, Texas_Pop_2020, Texas_Pop_2021, Texas_Pop_2022, Texas_Pop_2023, Texas_Pop_2024)# Apply function to all datasetsdf_list <-lapply(df_list, calculate_incarceration_stats)# Reassign modified datasets back to original variablesTexas_Pop_2016 <- df_list[[1]]Texas_Pop_2017 <- df_list[[2]]Texas_Pop_2018 <- df_list[[3]]Texas_Pop_2019 <- df_list[[4]]Texas_Pop_2020 <- df_list[[5]]Texas_Pop_2021 <- df_list[[6]]Texas_Pop_2022 <- df_list[[7]]Texas_Pop_2023 <- df_list[[8]]Texas_Pop_2024 <- df_list[[9]]
library(dplyr)# Function to filter LPT (15+ years) and categorize age groupsprocess_lpt_data <-function(df) { df %>%filter(LOS_Years >=15) %>%# Filter only individuals serving 15+ yearsmutate(# Categorizing Age at AdmissionAdmin_Age_Group =case_when( estimated_age_at_admission <18~"Youth", estimated_age_at_admission >=18& estimated_age_at_admission <25~"Emerging Adult",TRUE~"Other" ),# Categorizing Current AgeCurrent_Age_Group =case_when( age >=55~"Aging Population",TRUE~"Other" ) )}# List of datasetsdf_list <-list(Texas_Pop_2016, Texas_Pop_2017, Texas_Pop_2018, Texas_Pop_2019, Texas_Pop_2020, Texas_Pop_2021, Texas_Pop_2022, Texas_Pop_2023, Texas_Pop_2024)# Apply function to all datasetsdf_list <-lapply(df_list, process_lpt_data)# Reassign filtered datasets back to original variablesTexas_Pop_2016 <- df_list[[1]]Texas_Pop_2017 <- df_list[[2]]Texas_Pop_2018 <- df_list[[3]]Texas_Pop_2019 <- df_list[[4]]Texas_Pop_2020 <- df_list[[5]]Texas_Pop_2021 <- df_list[[6]]Texas_Pop_2022 <- df_list[[7]]Texas_Pop_2023 <- df_list[[8]]Texas_Pop_2024 <- df_list[[9]]
library(dplyr)library(lubridate)# Define the cutoff date (July 1, 2024)cutoff_date <-as.Date("2024-07-01")# Combine all datasets into a single dataframetexas_lpt_combined <-bind_rows(df_list, .id ="source_df")# Ensure maximum_sentence_date is in Date formattexas_lpt_combined <- texas_lpt_combined %>%mutate(maximum_sentence_date =as.Date(maximum_sentence_date, format ="%Y-%m-%d"))# Identify individuals who were serving an LPT at any timelpt_individuals <- texas_lpt_combined %>%filter(LOS_Years >=15) %>%select(tdcj_number, maximum_sentence_date) %>%distinct()# Identify individuals whose max sentence date extends beyond July 1, 2024still_incarcerated <- lpt_individuals %>%filter(maximum_sentence_date > cutoff_date) %>%select(tdcj_number)# Identify individuals who are missing from the 2024 dataset and have a max sentence date before or on July 1, 2024individuals_missing_in_2024 <-setdiff(lpt_individuals$tdcj_number, texas_lpt_combined$tdcj_number[texas_lpt_combined$cohort_year ==2024])released_or_deceased <- lpt_individuals %>%filter(tdcj_number %in% individuals_missing_in_2024 & maximum_sentence_date <= cutoff_date) %>%select(tdcj_number)# Merge back to the combined datasettexas_lpt_combined <- texas_lpt_combined %>%mutate(still_incarcerated =ifelse(tdcj_number %in% still_incarcerated$tdcj_number, TRUE, FALSE),released_or_deceased =ifelse(tdcj_number %in% released_or_deceased$tdcj_number, TRUE, FALSE) )# Check summarytable(texas_lpt_combined$still_incarcerated)
FALSE TRUE
11418 147819
table(texas_lpt_combined$released_or_deceased)
FALSE TRUE
147454 11783
library(dplyr)library(ggplot2)library(lubridate)# Combine all datasets into a single dataframetexas_lpt_combined <-bind_rows(df_list, .id ="source_df")# Convert maximum sentence date to Date formattexas_lpt_combined <- texas_lpt_combined %>%mutate(maximum_sentence_date =as.Date(maximum_sentence_date, format ="%Y-%m-%d"))# Identify individuals with maximum sentence date beyond July 1, 2024texas_lpt_combined <- texas_lpt_combined %>%mutate(still_incarcerated_after_2024 =ifelse(maximum_sentence_date >as.Date("2024-07-01"), TRUE, FALSE))# Summary of still incarcerated individualstable(texas_lpt_combined$still_incarcerated_after_2024)
FALSE TRUE
11680 147432
# Count number of LPT individuals by cohort yearlpt_growth <- texas_lpt_combined %>%group_by(cohort_year) %>%summarize(total_LPT =n())# Plot the trend of LPT cases over timeggplot(lpt_growth, aes(x = cohort_year, y = total_LPT)) +geom_line() +geom_point() +labs(title ="Growth of Long Prison Terms Over Time",x ="Cohort Year",y ="Number of LPT Cases") +theme_minimal()
# Race distribution among LPTslpt_race_dist <- texas_lpt_combined %>%filter(still_incarcerated_after_2024) %>%group_by(race) %>%summarize(count =n())# Gender distribution among LPTslpt_gender_dist <- texas_lpt_combined %>%filter(still_incarcerated_after_2024) %>%group_by(gender) %>%summarize(count =n())# Plot race distributionggplot(lpt_race_dist, aes(x =reorder(race, -count), y = count, fill = race)) +geom_bar(stat ="identity") +labs(title ="Race Distribution of LPT Cases",x ="Race",y ="Count") +theme_minimal() +theme(legend.position ="none")
# Plot gender distributionggplot(lpt_gender_dist, aes(x = gender, y = count, fill = gender)) +geom_bar(stat ="identity") +labs(title ="Gender Distribution of LPT Cases",x ="Gender",y ="Count") +theme_minimal() +theme(legend.position ="none")
library(dplyr)library(ggplot2)# Aggregate count of LPT individuals by cohort year and age group at admissionlpt_by_age_at_admission <- texas_lpt_combined %>%filter(!is.na(Admin_Age_Group)) %>%# Remove missing values if anygroup_by(cohort_year, Admin_Age_Group) %>%summarize(count =n(), .groups ="drop")# Plot LPT growth by age at admission over timeggplot(lpt_by_age_at_admission, aes(x = cohort_year, y = count, color = Admin_Age_Group, group = Admin_Age_Group)) +geom_line(size =1) +geom_point(size =2) +# Add points for better readabilityscale_x_continuous(breaks =seq(min(lpt_by_age_at_admission$cohort_year, na.rm =TRUE), max(lpt_by_age_at_admission$cohort_year, na.rm =TRUE), 1)) +# Ensure yearly intervalslabs(title ="LPT Growth by Age at Admission (2016-2024)",x ="Cohort Year", y ="Number of People",color ="Age at Admission") +theme_minimal() +theme(legend.position ="right") # Adjust legend position for readability
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
library(dplyr)library(ggplot2)# Aggregate count of LPT individuals by cohort year and current age grouplpt_by_current_age <- texas_lpt_combined %>%filter(!is.na(Current_Age_Group)) %>%# Remove missing values if anygroup_by(cohort_year, Current_Age_Group) %>%summarize(count =n(), .groups ="drop")# Plot LPT growth by current age over timeggplot(lpt_by_current_age, aes(x = cohort_year, y = count, color = Current_Age_Group, group = Current_Age_Group)) +geom_line(size =1) +geom_point(size =2) +# Add points for better readabilityscale_x_continuous(breaks =seq(min(lpt_by_current_age$cohort_year, na.rm =TRUE), max(lpt_by_current_age$cohort_year, na.rm =TRUE), 1)) +# Ensure yearly intervalslabs(title ="LPT Growth by Current Age (2016-2024)",x ="Cohort Year", y ="Number of People",color ="Current Age Group") +theme_minimal() +theme(legend.position ="right") # Adjust legend position for better visualization