This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
head(compas_scores_two_years)

# Data Cleaning and Preparation
data <- compas_scores_two_years %>%
  select(id, sex, age, race, priors_count...9, is_recid) %>%
  filter(!is.na(is_recid))

# Split into training and testing sets (75% training data) #learned about logistic training from various sources including https://stackoverflow.com/questions/58617114/logistic-regression-training-and-test-data, https://stats.stackexchange.com/questions/94770/when-to-divide-data-into-training-test-set-in-logistic-regression, https://www.datacamp.com/tutorial/logistic-regression-R
set.seed(8675309)
training_rows <- sample(nrow(data), 0.75 * nrow(data))
train_data <- data[training_rows, ]
test_data <- data[-training_rows, ]

# Fit Logistic Regression Model on Training Data
model <- glm(is_recid ~ age + race + priors_count...9, data = train_data, family = "binomial")

# Predict on Testing Data
test_data$predicted_recid <- predict(model, newdata = test_data, type = "response")

# Convert probabilities to binary outcomes
test_data$predicted_class <- ifelse(test_data$predicted_recid > 0.5, 1, 0)

# Assess Model Performance(Here we calculate the overall accuracy)
correct_predictions <- test_data$predicted_class == test_data$is_recid
accuracy <- sum(correct_predictions) / length(correct_predictions)
print(paste("Accuracy: ", accuracy))
[1] "Accuracy:  0.667960088691796"
# Assess Model Performance Across Races
accuracy_by_race <- test_data %>%
  group_by(race) %>%
  summarise(
    count = n(),
    correct = sum(predicted_class == is_recid),
    accuracy = correct / count
  )

print(accuracy_by_race)

# To visualize the results:
library(ggplot2)
Warning: package ‘ggplot2’ was built under R version 4.3.2
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

ggplot(accuracy_by_race, aes(x = race, y = accuracy, fill = race)) +
  geom_bar(stat = "identity") +
  scale_colour_manual(values=cbPalette) +
  theme_minimal() +
  labs(title = "Accuracy of Recidivism Prediction by Race")


#to try and understand the graphh above we look at predicted rate vs actual recidivism
average_scores <- test_data %>%
  group_by(race) %>%
  summarise(avg_predicted_class = mean(as.numeric(predicted_class), na.rm = TRUE),
            actual_recidivism_rate = mean((is_recid), na.rm = TRUE))

#Compare these averages
average_scores

#anova test
anova_result <- aov(is_recid ~ race, data = compas_scores_two_years)
summary(anova_result)
              Df Sum Sq Mean Sq F value Pr(>F)    
race           5   38.6   7.720   31.58 <2e-16 ***
Residuals   7208 1762.3   0.244                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#p values for model
summary(model)

Call:
glm(formula = is_recid ~ age + race + priors_count...9, family = "binomial", 
    data = train_data)

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)          1.173914   0.095492  12.293  < 2e-16 ***
age                 -0.048970   0.002800 -17.490  < 2e-16 ***
raceAsian           -0.121598   0.453322  -0.268  0.78852    
raceCaucasian       -0.124821   0.066850  -1.867  0.06188 .  
raceHispanic        -0.331490   0.108192  -3.064  0.00218 ** 
raceNative American  0.164299   0.691657   0.238  0.81223    
raceOther           -0.265813   0.133867  -1.986  0.04707 *  
priors_count...9     0.163044   0.008224  19.824  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 7493.5  on 5409  degrees of freedom
Residual deviance: 6643.1  on 5402  degrees of freedom
AIC: 6659.1

Number of Fisher Scoring iterations: 4

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKVGhpcyBpcyBhbiBbUiBNYXJrZG93bl0oaHR0cDovL3JtYXJrZG93bi5yc3R1ZGlvLmNvbSkgTm90ZWJvb2suIFdoZW4geW91IGV4ZWN1dGUgY29kZSB3aXRoaW4gdGhlIG5vdGVib29rLCB0aGUgcmVzdWx0cyBhcHBlYXIgYmVuZWF0aCB0aGUgY29kZS4gCgpUcnkgZXhlY3V0aW5nIHRoaXMgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpSdW4qIGJ1dHRvbiB3aXRoaW4gdGhlIGNodW5rIG9yIGJ5IHBsYWNpbmcgeW91ciBjdXJzb3IgaW5zaWRlIGl0IGFuZCBwcmVzc2luZyAqQ21kK1NoaWZ0K0VudGVyKi4gCgpgYGB7cn0KbGlicmFyeShkcGx5cikKCmhlYWQoY29tcGFzX3Njb3Jlc190d29feWVhcnMpCgojIERhdGEgQ2xlYW5pbmcgYW5kIFByZXBhcmF0aW9uCmRhdGEgPC0gY29tcGFzX3Njb3Jlc190d29feWVhcnMgJT4lCiAgc2VsZWN0KGlkLCBzZXgsIGFnZSwgcmFjZSwgcHJpb3JzX2NvdW50Li4uOSwgaXNfcmVjaWQpICU+JQogIGZpbHRlcighaXMubmEoaXNfcmVjaWQpKQoKIyBTcGxpdCBpbnRvIHRyYWluaW5nIGFuZCB0ZXN0aW5nIHNldHMgKDc1JSB0cmFpbmluZyBkYXRhKSAjbGVhcm5lZCBhYm91dCBsb2dpc3RpYyB0cmFpbmluZyBmcm9tIHZhcmlvdXMgc291cmNlcyBpbmNsdWRpbmcgaHR0cHM6Ly9zdGFja292ZXJmbG93LmNvbS9xdWVzdGlvbnMvNTg2MTcxMTQvbG9naXN0aWMtcmVncmVzc2lvbi10cmFpbmluZy1hbmQtdGVzdC1kYXRhLCBodHRwczovL3N0YXRzLnN0YWNrZXhjaGFuZ2UuY29tL3F1ZXN0aW9ucy85NDc3MC93aGVuLXRvLWRpdmlkZS1kYXRhLWludG8tdHJhaW5pbmctdGVzdC1zZXQtaW4tbG9naXN0aWMtcmVncmVzc2lvbiwgaHR0cHM6Ly93d3cuZGF0YWNhbXAuY29tL3R1dG9yaWFsL2xvZ2lzdGljLXJlZ3Jlc3Npb24tUgpzZXQuc2VlZCg4Njc1MzA5KQp0cmFpbmluZ19yb3dzIDwtIHNhbXBsZShucm93KGRhdGEpLCAwLjc1ICogbnJvdyhkYXRhKSkKdHJhaW5fZGF0YSA8LSBkYXRhW3RyYWluaW5nX3Jvd3MsIF0KdGVzdF9kYXRhIDwtIGRhdGFbLXRyYWluaW5nX3Jvd3MsIF0KCiMgRml0IExvZ2lzdGljIFJlZ3Jlc3Npb24gTW9kZWwgb24gVHJhaW5pbmcgRGF0YQptb2RlbCA8LSBnbG0oaXNfcmVjaWQgfiBhZ2UgKyByYWNlICsgcHJpb3JzX2NvdW50Li4uOSwgZGF0YSA9IHRyYWluX2RhdGEsIGZhbWlseSA9ICJiaW5vbWlhbCIpCgojIFByZWRpY3Qgb24gVGVzdGluZyBEYXRhCnRlc3RfZGF0YSRwcmVkaWN0ZWRfcmVjaWQgPC0gcHJlZGljdChtb2RlbCwgbmV3ZGF0YSA9IHRlc3RfZGF0YSwgdHlwZSA9ICJyZXNwb25zZSIpCgojIENvbnZlcnQgcHJvYmFiaWxpdGllcyB0byBiaW5hcnkgb3V0Y29tZXMKdGVzdF9kYXRhJHByZWRpY3RlZF9jbGFzcyA8LSBpZmVsc2UodGVzdF9kYXRhJHByZWRpY3RlZF9yZWNpZCA+IDAuNSwgMSwgMCkKCiMgQXNzZXNzIE1vZGVsIFBlcmZvcm1hbmNlKEhlcmUgd2UgY2FsY3VsYXRlIHRoZSBvdmVyYWxsIGFjY3VyYWN5KQpjb3JyZWN0X3ByZWRpY3Rpb25zIDwtIHRlc3RfZGF0YSRwcmVkaWN0ZWRfY2xhc3MgPT0gdGVzdF9kYXRhJGlzX3JlY2lkCmFjY3VyYWN5IDwtIHN1bShjb3JyZWN0X3ByZWRpY3Rpb25zKSAvIGxlbmd0aChjb3JyZWN0X3ByZWRpY3Rpb25zKQpwcmludChwYXN0ZSgiQWNjdXJhY3k6ICIsIGFjY3VyYWN5KSkKCiMgQXNzZXNzIE1vZGVsIFBlcmZvcm1hbmNlIEFjcm9zcyBSYWNlcwphY2N1cmFjeV9ieV9yYWNlIDwtIHRlc3RfZGF0YSAlPiUKICBncm91cF9ieShyYWNlKSAlPiUKICBzdW1tYXJpc2UoCiAgICBjb3VudCA9IG4oKSwKICAgIGNvcnJlY3QgPSBzdW0ocHJlZGljdGVkX2NsYXNzID09IGlzX3JlY2lkKSwKICAgIGFjY3VyYWN5ID0gY29ycmVjdCAvIGNvdW50CiAgKQoKcHJpbnQoYWNjdXJhY3lfYnlfcmFjZSkKCiMgVG8gdmlzdWFsaXplIHRoZSByZXN1bHRzOgpsaWJyYXJ5KGdncGxvdDIpCmNiUGFsZXR0ZSA8LSBjKCIjMDAwMDAwIiwgIiNFNjlGMDAiLCAiIzU2QjRFOSIsICIjMDA5RTczIiwgIiNGMEU0NDIiLCAiIzAwNzJCMiIsICIjRDU1RTAwIiwgIiNDQzc5QTciKQoKZ2dwbG90KGFjY3VyYWN5X2J5X3JhY2UsIGFlcyh4ID0gcmFjZSwgeSA9IGFjY3VyYWN5LCBmaWxsID0gcmFjZSkpICsKICBnZW9tX2JhcihzdGF0ID0gImlkZW50aXR5IikgKwogIHNjYWxlX2NvbG91cl9tYW51YWwodmFsdWVzPWNiUGFsZXR0ZSkgKwogIHRoZW1lX21pbmltYWwoKSArCiAgbGFicyh0aXRsZSA9ICJBY2N1cmFjeSBvZiBSZWNpZGl2aXNtIFByZWRpY3Rpb24gYnkgUmFjZSIpCgojdG8gdHJ5IGFuZCB1bmRlcnN0YW5kIHRoZSBncmFwaGggYWJvdmUgd2UgbG9vayBhdCBwcmVkaWN0ZWQgcmF0ZSB2cyBhY3R1YWwgcmVjaWRpdmlzbQphdmVyYWdlX3Njb3JlcyA8LSB0ZXN0X2RhdGEgJT4lCiAgZ3JvdXBfYnkocmFjZSkgJT4lCiAgc3VtbWFyaXNlKGF2Z19wcmVkaWN0ZWRfY2xhc3MgPSBtZWFuKGFzLm51bWVyaWMocHJlZGljdGVkX2NsYXNzKSwgbmEucm0gPSBUUlVFKSwKICAgICAgICAgICAgYWN0dWFsX3JlY2lkaXZpc21fcmF0ZSA9IG1lYW4oKGlzX3JlY2lkKSwgbmEucm0gPSBUUlVFKSkKCiNDb21wYXJlIHRoZXNlIGF2ZXJhZ2VzCmF2ZXJhZ2Vfc2NvcmVzCgojYW5vdmEgdGVzdAphbm92YV9yZXN1bHQgPC0gYW92KGlzX3JlY2lkIH4gcmFjZSwgZGF0YSA9IGNvbXBhc19zY29yZXNfdHdvX3llYXJzKQpzdW1tYXJ5KGFub3ZhX3Jlc3VsdCkKI3AgdmFsdWVzIGZvciBtb2RlbApzdW1tYXJ5KG1vZGVsKQoKYGBgCgpBZGQgYSBuZXcgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpJbnNlcnQgQ2h1bmsqIGJ1dHRvbiBvbiB0aGUgdG9vbGJhciBvciBieSBwcmVzc2luZyAqQ21kK09wdGlvbitJKi4KCldoZW4geW91IHNhdmUgdGhlIG5vdGVib29rLCBhbiBIVE1MIGZpbGUgY29udGFpbmluZyB0aGUgY29kZSBhbmQgb3V0cHV0IHdpbGwgYmUgc2F2ZWQgYWxvbmdzaWRlIGl0IChjbGljayB0aGUgKlByZXZpZXcqIGJ1dHRvbiBvciBwcmVzcyAqQ21kK1NoaWZ0K0sqIHRvIHByZXZpZXcgdGhlIEhUTUwgZmlsZSkuIAoKVGhlIHByZXZpZXcgc2hvd3MgeW91IGEgcmVuZGVyZWQgSFRNTCBjb3B5IG9mIHRoZSBjb250ZW50cyBvZiB0aGUgZWRpdG9yLiBDb25zZXF1ZW50bHksIHVubGlrZSAqS25pdCosICpQcmV2aWV3KiBkb2VzIG5vdCBydW4gYW55IFIgY29kZSBjaHVua3MuIEluc3RlYWQsIHRoZSBvdXRwdXQgb2YgdGhlIGNodW5rIHdoZW4gaXQgd2FzIGxhc3QgcnVuIGluIHRoZSBlZGl0b3IgaXMgZGlzcGxheWVkLgoK