demo <- demo %>%
mutate(gender = recode(gender, female = 'F', male = 'M'))
table(demo$gender)
##
## F M
## 4031 16405
demo = demo %>%
distinct(person_id, .keep_all = TRUE) # remove duplicates
demo$person_id <- as.double(demo$person_id)
case <- left_join(case, demo)
## Joining, by = "person_id"
case <- case %>%
separate(address, c('address', 'city'), sep = ",")
case$city <- toupper(case$city) # make everything upper-case
case <- case %>% filter(city == " CHICAGO")
nrow(case)
## [1] 25000
There are 25,000 observations.
case$age = time_length(difftime(case$arrest_date,case$bdate), "years")
case$age <- floor(case$age) # remove decimal points
case %>% select(arrest_date, bdate, age) %>% head(., 10)
## arrest_date bdate age
## 1 2012-01-04 1985-07-03 26
## 2 2012-07-11 1985-07-03 27
## 3 2013-04-04 1985-07-03 27
## 4 2012-03-31 1986-09-27 25
## 5 2012-12-09 1991-06-07 21
## 6 2012-02-25 1994-08-24 17
## 7 2012-10-06 1978-04-04 34
## 8 2013-04-06 1978-04-04 35
## 9 2012-01-12 1994-07-11 17
## 10 2013-09-25 1994-07-11 19
# First, I am going to include a new value in case for when the program ended, in October of 2013.
case$program_ended = "2013-10-31"
# It is unclear at what point of time the person has to be 18-24 years old since the 'age' value has multiple age values based on when they committed to crime throughout the program. So I am going to create a new column that includes the age of the person at the time the program ended, in October of 2013.
case$age_program.ended = time_length(difftime(case$program_ended,case$bdate), "years")
case$age_program.ended<- floor(case$age_program.ended)
# filter person id and age
age = case %>% select(person_id, age_program.ended)
age = age %>% group_by(person_id) %>% summarize(age_program.ended = mean(age_program.ended))
# merge with grades
grades <- left_join(grades, age)
## Joining, by = "person_id"
# filter for ages between 18 and 24 years old
grades_filtered = grades %>% filter(age_program.ended <= 24 &
age_program.ended>=18)
# rename grades to letters
grades_filtered = grades_filtered %>% mutate(across(
gr9_fall_math: gr10_spring_hist,
~recode(., A = '4', B = '3', C = '2', D = '1', F = '0' )
))
# make them numeric
grades_filtered = grades_filtered %>% mutate(across(, ~as.double(.)))
# but make person_id character
grades_filtered$person_id <- as.character(grades_filtered$person_id)
# find GPA. For 9th grade, it starts with "gr9".
grades_9th = grades_filtered %>% select(starts_with(c("person_id","gr9"))) %>%
pivot_longer(starts_with("gr9"), names_to = "course", values_to = "ClassGrade")
grades_9th %>% group_by(person_id) %>% summarize(GPA.9th.Grade = mean(ClassGrade, na.rm=TRUE)) %>% head(., 10)
## # A tibble: 10 × 2
## person_id GPA.9th.Grade
## <chr> <dbl>
## 1 10155 2.25
## 2 10194 3.14
## 3 10302 3.4
## 4 1042 2
## 5 10646 3.14
## 6 10670 1
## 7 10722 3.25
## 8 10735 2.57
## 9 10877 3.5
## 10 10965 3
# Find GPA. For 10th grade, it starts with "gr10"
grades_10th = grades_filtered %>% select(starts_with(c("person_id","gr10"))) %>%
pivot_longer(starts_with("gr10"), names_to = "course", values_to = "ClassGrade")
grades_10th %>% group_by(person_id) %>% summarize(GPA.10th.Grade = mean(ClassGrade, na.rm=TRUE)) %>% head(., 10)
## # A tibble: 10 × 2
## person_id GPA.10th.Grade
## <chr> <dbl>
## 1 10155 3.67
## 2 10194 3
## 3 10302 4
## 4 1042 3.57
## 5 10646 2.57
## 6 10670 2.57
## 7 10722 1.88
## 8 10735 2.25
## 9 10877 2.88
## 10 10965 3
Please reconstruct this variable using the prior_arrests.csv file. Assume that all of the individual’s arrests prior to the study period are contained in prior_arrest.csv.
If someone is not included in prior_arrests.csv, assume they had zero arrests at the arrest_date of the study period.
Also note that some individuals were arrested multiple times during the study period and that this should be accounted for in your prior arrest count. For example, if individual A was arrested 5 times prior to the study period and appears twice in the case file, their first arrest in the case file should have a prior arrest count of “5” and their second arrest should have a prior arrest count of “6”.
One final note, some people really do get arrested multiple times on the same day. Count each arrest separately, regardless of whether another arrest occurred on the same day.
case$arrest_date <- as.Date(case$arrest_date) # transform data
prior_arrests$arrest_date <- as.Date(prior_arrests$arrest_date,
format = "%m/%d/%y")
case = full_join(case, prior_arrests)
## Joining, by = c("person_id", "arrest_date")
case = case %>% group_by(person_id) %>% arrange(person_id, arrest_date) %>%
mutate(prior_arrests2 = row_number()-1) %>% filter(!is.na(caseid))
case %>% select(caseid, person_id, prior_arrests, prior_arrests2) %>% head(., 10)
## # A tibble: 10 × 4
## # Groups: person_id [6]
## caseid person_id prior_arrests prior_arrests2
## <int> <dbl> <int> <dbl>
## 1 57514 1 2 2
## 2 39970 1 3 3
## 3 88413 1 4 4
## 4 40216 5 2 2
## 5 92255 6 3 3
## 6 26516 7 0 0
## 7 2913 8 4 4
## 8 6304 8 5 5
## 9 82277 9 1 1
## 10 31881 9 2 2
# transform dates
case$arrest_date <- as.POSIXct(case$arrest_date, format="%Y-%m-%d")
case$dispos_date <- as.POSIXct(case$dispos_date, format="%Y-%m-%d")
case2 = case %>% select(-re_arrest) # remove re-arrest, reconstruct
###
case2 <- data.table(case2)
case2 = case2[case2, on = .(person_id = person_id, arrest_date<arrest_date,dispos_date>arrest_date),nomatch = NULL]
case2$re_arrest2 = "1" # 1 for re_arrests
case2 = case2 %>% distinct(caseid, .keep_all = TRUE)
case2= case2 %>% select(caseid,person_id,re_arrest2)
# merge with original case data
case= left_join(case, case2, by = c("caseid", "person_id"))
case$re_arrest2 <- as.double(case$re_arrest2) # make numeric
case <- case %>% mutate(re_arrest2 = replace_na(re_arrest2, 0)) # replace na with 0
case %>% select(caseid, person_id, arrest_date, dispos_date, re_arrest, re_arrest2) %>% head(., 10)
## # A tibble: 10 × 6
## # Groups: person_id [6]
## caseid person_id arrest_date dispos_date re_arrest re_arrest2
## <int> <dbl> <dttm> <dttm> <int> <dbl>
## 1 57514 1 2012-01-03 18:00:00 2012-03-27 00:00:00 0 0
## 2 39970 1 2012-07-10 19:00:00 2012-10-20 00:00:00 0 0
## 3 88413 1 2013-04-03 19:00:00 2013-06-22 00:00:00 0 0
## 4 40216 5 2012-03-30 19:00:00 2013-03-25 00:00:00 0 0
## 5 92255 6 2012-12-08 18:00:00 2013-11-09 00:00:00 0 0
## 6 26516 7 2012-02-24 18:00:00 2012-03-26 00:00:00 0 0
## 7 2913 8 2012-10-05 19:00:00 2013-12-29 00:00:00 1 1
## 8 6304 8 2013-04-05 19:00:00 2013-07-07 00:00:00 0 0
## 9 82277 9 2012-01-11 18:00:00 2012-11-08 00:00:00 0 0
## 10 31881 9 2013-09-24 19:00:00 2013-12-29 00:00:00 0 0
all(case$prior_arrests == case$prior_arrests2)
## [1] TRUE
all(case$re_arrest == case$re_arrest2)
## [1] FALSE
The original re_arrest and my re_arrest2 variable are not equal.
subset(case, case$re_arrest != case$re_arrest2) %>% select(caseid, person_id, arrest_date, dispos_date, re_arrest, re_arrest2) %>% arrange(person_id)
## # A tibble: 20 × 6
## # Groups: person_id [20]
## caseid person_id arrest_date dispos_date re_arrest re_arrest2
## <int> <dbl> <dttm> <dttm> <int> <dbl>
## 1 8935 805 2012-06-27 19:00:00 2012-08-03 00:00:00 0 1
## 2 6884 3979 2012-02-15 18:00:00 2012-04-28 00:00:00 0 1
## 3 54848 4203 2012-01-28 18:00:00 2012-08-01 00:00:00 0 1
## 4 13792 6618 2012-05-27 19:00:00 2012-07-09 00:00:00 0 1
## 5 7675 7027 2012-11-08 18:00:00 2012-12-14 00:00:00 0 1
## 6 14750 7220 2012-06-19 19:00:00 2012-12-14 00:00:00 0 1
## 7 63361 7734 2013-01-14 18:00:00 2013-06-05 00:00:00 0 1
## 8 18821 8845 2012-11-06 18:00:00 2013-07-16 00:00:00 0 1
## 9 90557 12442 2012-11-01 19:00:00 2012-12-24 00:00:00 0 1
## 10 74987 12675 2012-05-29 19:00:00 2013-01-21 00:00:00 0 1
## 11 75928 13443 2012-11-27 18:00:00 2013-09-14 00:00:00 0 1
## 12 93793 14088 2012-12-29 18:00:00 2013-10-05 00:00:00 0 1
## 13 21030 15329 2012-02-27 18:00:00 2012-04-03 00:00:00 0 1
## 14 30952 16018 2012-01-29 18:00:00 2012-08-26 00:00:00 0 1
## 15 36117 16233 2013-01-01 18:00:00 2013-02-06 00:00:00 0 1
## 16 33199 16599 2012-02-13 18:00:00 2012-06-19 00:00:00 0 1
## 17 75387 17606 2013-08-22 19:00:00 2013-09-09 00:00:00 0 1
## 18 86083 17678 2013-05-06 19:00:00 2013-08-13 00:00:00 0 1
## 19 56194 18807 2012-01-09 18:00:00 2012-03-29 00:00:00 0 1
## 20 52222 19696 2013-01-05 18:00:00 2013-01-25 00:00:00 0 1
Re-arrests 2 is more accurate. There are 20 false observatipns with the original “re_arrest” variable.
case %>% filter(person_id == "805") %>% select(caseid,arrest_date, dispos_date, re_arrest, re_arrest2)
## Adding missing grouping variables: `person_id`
## # A tibble: 4 × 6
## # Groups: person_id [1]
## person_id caseid arrest_date dispos_date re_arrest re_arrest2
## <dbl> <int> <dttm> <dttm> <int> <dbl>
## 1 805 46621 2012-02-16 18:00:00 2012-02-27 00:00:00 0 0
## 2 805 8935 2012-06-27 19:00:00 2012-08-03 00:00:00 0 1
## 3 805 59044 2012-08-02 19:00:00 2012-10-12 00:00:00 0 0
## 4 805 45487 2013-06-16 19:00:00 2014-01-26 00:00:00 0 0
For example, person id 805 was re-arrested while awaiting for disposition in case id 8935
case %>% filter(person_id == "3979") %>% select(caseid,arrest_date, dispos_date, re_arrest, re_arrest2)
## Adding missing grouping variables: `person_id`
## # A tibble: 3 × 6
## # Groups: person_id [1]
## person_id caseid arrest_date dispos_date re_arrest re_arrest2
## <dbl> <int> <dttm> <dttm> <int> <dbl>
## 1 3979 26497 2012-01-12 18:00:00 2012-02-09 00:00:00 0 0
## 2 3979 6884 2012-02-15 18:00:00 2012-04-28 00:00:00 0 1
## 3 3979 26822 2012-04-27 19:00:00 2012-07-25 00:00:00 0 0
person id 3979 was re-arrested while awaiting for disposition in case id 6884
Help the State’s Attorney’s Office determine if the program should be continued/expanded by estimating the program’s effect on re-arrests prior to disposition. Because we only have grades data for young adults, please do not use these data to inform your statistical analysis. To draw conclusions about this program’s effect, answer the following questions.
# Keeping only the unique persons
demographic = case %>% distinct(person_id, .keep_all = TRUE)
table(demographic$race)
##
## ASIAN BLACK WHITE
## 707 10441 3205
More African Americans than white and Asian in this study
table(demographic$gender)
##
## F M
## 2842 11511
significantly more males than females by 8.6K
summary(case$age,na.rm=TRUE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.00 24.00 29.00 29.82 35.00 70.00
Average age/median of ~29 years. Minimum age of 9 years, maximum age of 70 yrs
table1 = case %>% group_by(treat) %>%
summarize(
count = n(),
meanage = mean(age),
)
table1$meanage <- round(table1$meanage, 0)
DT::datatable(table1,
colnames = c( 'Treatment', 'Total Number', 'Mean Age(years)'))
table2 = case %>% group_by(treat, gender) %>%
summarize(
count = n()
)
## `summarise()` has grouped output by 'treat'. You can override using the
## `.groups` argument.
DT::datatable(table2,
colnames = c( 'Treatment', 'Gender', 'Total'))
table3 = case %>% group_by(treat, race) %>%
summarize(
count = n(),
)
## `summarise()` has grouped output by 'treat'. You can override using the
## `.groups` argument.
DT::datatable(table3,
colnames = c( 'Treatment', 'Race', 'Total'))
Treatment group have slightly higher numbers, are slightly older on average by ~3years, and have more males. However, the proportion is the same. It is important to keep in mind that each person can belong to both treatment and control if they committed multiple crimes!
ggplot(data = case) +
geom_bar(mapping = aes(x = race, fill = as.character(treat))) + facet_grid(~as.character(treat)) +
labs( x= "Race",
y= "Total number in each group",
title = "Treatment and control differences by race/ethnicity",
subtitle = " Separated by 0 = control, 1 = treatment")
case$treat <- as.factor(case$treat) # make treatment a factor
# this makes not participating in program the reference category
case = case %>% mutate(treat = fct_relevel(treat, "0"))
model = glm(re_arrest2 ~ treat + factor(race) + factor(gender) + age + prior_arrests2, data = case, family = "binomial"(link = 'logit'))
stargazer::stargazer(model, type = "text")
##
## =============================================
## Dependent variable:
## ---------------------------
## re_arrest2
## ---------------------------------------------
## treat1 -0.088***
## (0.033)
##
## factor(race)BLACK 0.037
## (0.073)
##
## factor(race)WHITE 0.025
## (0.079)
##
## factor(gender)M 0.027
## (0.040)
##
## age 0.022***
## (0.003)
##
## prior_arrests2 0.092***
## (0.010)
##
## Constant -2.353***
## (0.100)
##
## ---------------------------------------------
## Observations 25,000
## Log Likelihood -12,648.350
## Akaike Inf. Crit. 25,310.710
## =============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
exp(coef(model))
## (Intercept) treat1 factor(race)BLACK factor(race)WHITE
## 0.0951132 0.9159583 1.0378744 1.0253300
## factor(gender)M age prior_arrests2
## 1.0273674 1.0218710 1.0964953
Interpretation of the model Participating in the treatment program does have a significant effect on the likelihood of getting re-arrested before disposition. In fact, age and the number of prior arrests were also additional variables that have an effect on the likelihood of getting arrested before disposition.
The coefficient for treat1 tells us that while controlling for demographic variables and the number of prior arrests, participating in the program decreases the log odds of getting re-arrested while awaiting disposition by .088. With a p-value of 0.007, this is statistically significant at the .01 level **.
When we exponentiate the co-efficient, this tells us that the odds of re-arrest before disposition decreases by 8.40% for cases in which the person participates in the program compared to the cases that don’t. It is important to keep in mind that a person may be both in treatment and control if they committed multiple crimes.
The coefficients for age tells us that the odds of re-arrest before disposition increases by a factor of 1.02 for every additional year of age. The odds of re-arrest before disposition increases by a factor of 1.09 for every additional prior arrest.
Final Answer
Yes, Participating in the program reduces the likelihood of re-arrest before disposition!
summary(case$prior_arrests2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 4.000 3.798 5.000 16.000
served = case %>% filter(prior_arrests >=5,
age>=18,
treat == 0) %>% distinct(person_id) # important to keep in mind that while a person may not be treated in a current case, they may have been treated for another crime.
nrow(served)
## [1] 2216
Answer: Based on the logistic model, future implementation for the program should target older individuals, as the odds of re-arrest before disposition increase with age. The model also tells us that having more prior arrests increases the odds of re-arrest before disposition. This tells us that the program should target individuals with high number of prior-arrests, as these individuals have a higher likelihood of re-arrest before disposition than those with a low number of prior-arrests.
The summary statistics of prior arrests2 has a third quantile of 5. Therefore, I filtered the original case data to include individuals who has had 5 or more prior arrests, are 18 or older, and has not participated in the program. I would recommend the State’s Attorney Office to target 2216 individuals.
Race and gender are not predictor variables that explain re-arrest before disposition, so the State Attorney’s Office should not choose individuals on the basis of their gender and race.