library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
# Point install.packages() calls made later in the script at the RStudio
# CRAN mirror.
options(repos = c(CRAN = "https://cran.rstudio.com/"))
# Base-R import of the challenge data, kept as a plain data.frame.
NIJ_s_Recidivism_Challenge_Full_Dataset <- read.csv(
  file = "NIJ_s_Recidivism_Challenge_Full_Dataset.csv"
)
# readr import of the same file; the resulting tibble is only printed,
# not stored.
read_csv(file = "NIJ_s_Recidivism_Challenge_Full_Dataset.csv")
## Rows: 25835 Columns: 54
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): Gender, Race, Age_at_Release, Supervision_Level_First, Education_L...
## dbl (11): ID, Residence_PUMA, Supervision_Risk_Score_First, Avg_Days_per_Dru...
## lgl (21): Gang_Affiliated, Prior_Arrest_Episodes_DVCharges, Prior_Arrest_Epi...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 25,835 × 54
## ID Gender Race Age_at_Release Residence_PUMA Gang_Affiliated
## <dbl> <chr> <chr> <chr> <dbl> <lgl>
## 1 1 M BLACK 43-47 16 FALSE
## 2 2 M BLACK 33-37 16 FALSE
## 3 3 M BLACK 48 or older 24 FALSE
## 4 4 M WHITE 38-42 16 FALSE
## 5 5 M WHITE 33-37 16 FALSE
## 6 6 M WHITE 38-42 17 FALSE
## 7 7 M BLACK 48 or older 18 FALSE
## 8 8 M BLACK 38-42 16 FALSE
## 9 9 F BLACK 43-47 5 NA
## 10 10 M BLACK 43-47 16 FALSE
## # ℹ 25,825 more rows
## # ℹ 48 more variables: Supervision_Risk_Score_First <dbl>,
## # Supervision_Level_First <chr>, Education_Level <chr>, Dependents <chr>,
## # Prison_Offense <chr>, Prison_Years <chr>,
## # Prior_Arrest_Episodes_Felony <chr>, Prior_Arrest_Episodes_Misd <chr>,
## # Prior_Arrest_Episodes_Violent <chr>, Prior_Arrest_Episodes_Property <chr>,
## # Prior_Arrest_Episodes_Drug <chr>, …
# Load the challenge data as a base data.frame for modelling.
my_data <- read.csv(file = "NIJ_s_Recidivism_Challenge_Full_Dataset.csv")
# Ordinary least squares fit: share of days employed explained by age band
# at release, first supervision risk score and jobs held per year.
model <- lm(
  formula = Percent_Days_Employed ~
    Age_at_Release + Supervision_Risk_Score_First + Jobs_Per_Year,
  data = my_data
)
# Coefficient table and fit statistics.
summary(object = model)
##
## Call:
## lm(formula = Percent_Days_Employed ~ Age_at_Release + Supervision_Risk_Score_First +
## Jobs_Per_Year, data = my_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9635 -0.2455 -0.1115 0.2797 0.8058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1680346 0.0103390 16.253 < 2e-16 ***
## Age_at_Release23-27 0.0570724 0.0083333 6.849 7.63e-12 ***
## Age_at_Release28-32 0.0897878 0.0084400 10.638 < 2e-16 ***
## Age_at_Release33-37 0.1116539 0.0087448 12.768 < 2e-16 ***
## Age_at_Release38-42 0.1301109 0.0094687 13.741 < 2e-16 ***
## Age_at_Release43-47 0.1409572 0.0098145 14.362 < 2e-16 ***
## Age_at_Release48 or older 0.1344625 0.0095512 14.078 < 2e-16 ***
## Supervision_Risk_Score_First -0.0058470 0.0009671 -6.046 1.50e-09 ***
## Jobs_Per_Year 0.3418593 0.0024851 137.565 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3162 on 24548 degrees of freedom
## (1278 observations deleted due to missingness)
## Multiple R-squared: 0.4421, Adjusted R-squared: 0.4419
## F-statistic: 2432 on 8 and 24548 DF, p-value: < 2.2e-16
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Linearity check 1: Rainbow test from lmtest. Its null hypothesis is that the
# linear specification is adequate. (The test was previously run twice with
# identical output; it is now run once.)
raintest(model)
##
## Rainbow test
##
## data: model
## Rain = 0.9914, df1 = 12279, df2 = 12269, p-value = 0.6838
# The Rainbow test is not significant: the p-value (0.6838) is above 0.05, so
# we fail to reject the null hypothesis of linearity. The test itself gives no
# evidence that the relationship is non-linear.
# Linearity check 2: residuals-vs-fitted plot.
plot(model, which = 1)
# The plot looks very non-linear, so the visual check and the Rainbow test
# disagree here.
# Durbin-Watson test for autocorrelation in the residuals of `model`.
# The output below reports DW = 1.9857 with p-value = 0.1311 against the
# alternative that the true autocorrelation is greater than 0.
dwtest(model)
##
## Durbin-Watson test
##
## data: model
## DW = 1.9857, p-value = 0.1311
## alternative hypothesis: true autocorrelation is greater than 0
# Scatter plot of the absolute residuals against the fitted values, used to
# eyeball whether the residual spread changes with the fitted value.
plot(model$fitted.values, abs(model$residuals))
# Red horizontal reference line at zero (the lower bound of |residual|).
abline(h = 0, col = "red")
# My plot is not normal because the data is not close to the line. Hence it
# looks like a little kid made it up lol.
# Studentized Breusch-Pagan test on `model`; the output below reports
# BP = 1184.8 on 8 degrees of freedom with p-value < 2.2e-16.
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 1184.8, df = 8, p-value < 2.2e-16
# Normal Q-Q plot of the model residuals.
qqnorm(model$residuals)
# Add the Q-Q reference line in blue for comparison.
qqline(model$residuals, col = "blue")
# I wasn't able to do this test because of the reasons below. I hope
# the following work makes up for it. Thank you!
# shapiro.test(model$residuals)
# Error in shapiro.test(model$residuals) : sample size must be
# between 3 and 5000
# Install car only when it is missing, so that re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
# Attach car for vif().
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Variance inflation factors for the predictors in `model`. The output below
# lists GVIF, Df and GVIF^(1/(2*Df)) per term; every GVIF^(1/(2*Df)) value
# shown is close to 1.
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## Age_at_Release 1.308545 6 1.022663
## Supervision_Risk_Score_First 1.302877 1 1.141436
## Jobs_Per_Year 1.005600 1 1.002796
# Names of the add-on packages used by the diagnostics in this script.
packages <- c("lmtest", "car")
options(repos = c(CRAN = "https://cran.rstudio.com/"))
# cor() only works on numeric columns, and one of my variables
# (Prior_Arrest_Episodes_Felony) is stored as character rather than as a
# number. The work below builds a subset of the data so the correlation can
# be computed.
# Read the CSV once and reuse it (it was previously read twice back to back).
my_data <- read.csv("NIJ_s_Recidivism_Challenge_Full_Dataset.csv")
# NOTE: `data` shadows utils::data(); the name is kept so that any later code
# referring to it keeps working.
data <- my_data
# Keep the response plus the three candidate predictors, then drop every row
# that has a missing value in any of them.
data_subset <- na.omit(
  data[, c(
    "Percent_Days_Employed", "Supervision_Risk_Score_First",
    "Jobs_Per_Year", "Prior_Arrest_Episodes_Felony"
  )]
)
# Preview the first rows of the four-column subset.
head(data_subset)
## Percent_Days_Employed Supervision_Risk_Score_First Jobs_Per_Year
## 1 0.4885621 3 0.4476103
## 2 0.4252336 6 2.0000000
## 3 0.0000000 7 0.0000000
## 4 1.0000000 7 0.7189961
## 5 0.2035623 4 0.9293893
## 6 0.6742520 5 0.3078382
## Prior_Arrest_Episodes_Felony
## 1 6
## 2 7
## 3 6
## 4 8
## 5 4
## 6 4
# Inspect the column types; the output below shows that
# Prior_Arrest_Episodes_Felony is stored as character (chr).
str(data_subset)
## 'data.frame': 24557 obs. of 4 variables:
## $ Percent_Days_Employed : num 0.489 0.425 0 1 0.204 ...
## $ Supervision_Risk_Score_First: int 3 6 7 7 4 5 2 5 7 5 ...
## $ Jobs_Per_Year : num 0.448 2 0 0.719 0.929 ...
## $ Prior_Arrest_Episodes_Felony: chr "6" "7" "6" "8" ...
## - attr(*, "na.action")= 'omit' Named int [1:1278] 13 57 69 72 121 142 152 176 209 238 ...
## ..- attr(*, "names")= chr [1:1278] "13" "57" "69" "72" ...
# Prior_Arrest_Episodes_Felony is stored as character. A plain as.numeric()
# turned some entries into NA ("NAs introduced by coercion"), and because
# na.omit() had already run, those NAs then made cor() return NA.
# Presumably the offending entries are top-coded labels such as "10 or more"
# -- TODO confirm with unique(data_subset$Prior_Arrest_Episodes_Felony).
# Keep only the leading integer of each entry before converting; entries that
# do not start with a digit still become NA (with a warning).
data_subset$Prior_Arrest_Episodes_Felony <- as.numeric(
  sub("^\\s*([0-9]+).*$", "\\1", data_subset$Prior_Arrest_Episodes_Felony)
)
# Pairwise correlations between the three predictors, using only rows that
# are complete in all three columns.
cor_matrix <- cor(
  data_subset[, c(
    "Supervision_Risk_Score_First", "Jobs_Per_Year",
    "Prior_Arrest_Episodes_Felony"
  )],
  use = "complete.obs"
)
# Print the NA-safe matrix (previously a second cor() call without `use =`
# was printed instead, which showed NA for every
# Prior_Arrest_Episodes_Felony entry).
cor_matrix
# NOTE: re-run this chunk to regenerate the printed correlation matrix.
# lmtest is already attached earlier in this script via library(lmtest), so
# only install it when it is genuinely missing instead of reinstalling it on
# every run.
if (!requireNamespace("lmtest", quietly = TRUE)) {
  install.packages("lmtest")
}
# Names of the add-on packages used by the diagnostics in this script.
packages <- c("lmtest", "car")