library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(readr)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
# Load the CPI 3.3 extract (path is relative to the working directory).
# Column types are guessed by readr and reported in the message below.
CPI_3_3 <- readr::read_csv(file = "CPI 3.3.csv")
## Rows: 14774 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): County, Region, Disposition, Family Violence Indicated
## dbl (1): Fiscal Year
## num (1): Completed Investigations
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Store the two categorical columns as factors and make sure the
# investigation counts are plain numerics.
CPI_3_3[["Disposition"]] <- as.factor(CPI_3_3[["Disposition"]])
CPI_3_3[["Family Violence Indicated"]] <-
  as.factor(CPI_3_3[["Family Violence Indicated"]])
CPI_3_3[["Completed Investigations"]] <-
  as.numeric(CPI_3_3[["Completed Investigations"]])
library(dplyr)
# 0/1 indicators used by the models below:
#   RTB = 1 when Disposition is "Reason to Believe", otherwise 0
#   DV  = 1 when Family Violence Indicated is "Y", otherwise 0
# Missing values stay NA.
CPI_3_3[["RTB"]] <- as.numeric(CPI_3_3[["Disposition"]] == "Reason to Believe")
CPI_3_3[["DV"]] <- as.numeric(CPI_3_3[["Family Violence Indicated"]] == "Y")
# Linear regression of the RTB indicator on the DV indicator, with each row
# weighted by its number of completed investigations.
model <- lm(
  formula = RTB ~ DV,
  data = CPI_3_3,
  weights = `Completed Investigations`
)
summary(model)
##
## Call:
## lm(formula = RTB ~ DV, data = CPI_3_3, weights = `Completed Investigations`)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -26.182 -1.360 -0.555 1.090 51.372
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.198090 0.003619 54.74 <2e-16 ***
## DV 0.356993 0.010325 34.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.285 on 14772 degrees of freedom
## Multiple R-squared: 0.07487, Adjusted R-squared: 0.07481
## F-statistic: 1196 on 1 and 14772 DF, p-value: < 2.2e-16
# Linearity check: scatter of the binary outcome against the binary predictor.
plot(
  CPI_3_3$DV, CPI_3_3$RTB,
  xlab = "Family Violence (DV)",
  ylab = "RTB",
  main = "Linearity Check"
)
# Overlay the weighted fit stored in `model`, i.e. the regression that every
# other diagnostic in this script examines, instead of refitting a separate
# unweighted regression just for the plot.
abline(model, col = "blue")

# The plot is hard to read because both variables are binary, but the fitted line shows a positive relationship between DV and RTB.
# Rainbow test (lmtest) applied to the weighted linear model.
lmtest::raintest(model)
##
## Rainbow test
##
## data: model
## Rain = 0.99094, df1 = 7387, df2 = 7385, p-value = 0.6521
# The Rainbow test is not significant (p = 0.652, which is greater than .05), so there is no evidence against linearity.
# Durbin-Watson test (car) for autocorrelation in the model residuals.
car::durbinWatsonTest(model)
## lag Autocorrelation D-W Statistic p-value
## 1 0.1300966 1.739611 0
## Alternative hypothesis: rho != 0
# With a p-value below 0.05, the Durbin-Watson test rejects rho = 0, so the residuals are likely correlated (lag-1 autocorrelation of about 0.13).
# Homoscedasticity check: residuals against fitted values, with a reference
# line at zero.
plot(
  fitted(model), resid(model),
  xlab = "Fitted Values",
  ylab = "Residuals",
  main = "Homoscedasticity Check"
)
abline(h = 0, col = "green")

# Studentized Breusch-Pagan test (lmtest) for heteroscedasticity.
lmtest::bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 1610172, df = 1, p-value < 2.2e-16
# The Breusch-Pagan test was significant (p < .001), so the assumption of homoscedasticity was violated.
# Normality check: normal Q-Q plot of the model residuals with a reference line.
qqnorm(resid(model))
qqline(resid(model), col = "orange")

#shapiro.test(model$residuals)
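# The data set is too large for the Shapiro-Wilk test: shapiro.test() accepts at most 5000 observations, and this model has 14774 residuals.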
#
#cor(CPI_3_3[, c("DV", "County")])
#vif(model)
#Multicollinearity and correlation tests were not conducted because the model includes only one independent variable.
#4 Some of the assumptions were met, but not all. Linearity was fine, but the other assumptions showed problems. That is expected, because a binary (0/1) outcome does not work well with linear regression.
#5 Homoscedasticity and normality were clearly violated, and there was also a small issue with independence of errors.
#6 I would use logistic regression instead, since the outcome is binary. Linear regression is not designed for a binary outcome, so switching models would address most of these issues.
# Logistic regression of the RTB indicator on the DV indicator.
# The rows appear to be aggregated counts (the linear model above weights by
# `Completed Investigations`), so the same counts are passed as prior weights
# here. Without them every row would count as a single case and the two models
# would describe different populations.
# NOTE: the summary output recorded after this call came from the earlier
# unweighted fit; re-run the script to refresh it.
log_model <- glm(
  RTB ~ DV,
  data = CPI_3_3,
  family = binomial,
  weights = `Completed Investigations`
)
summary(log_model)
##
## Call:
## glm(formula = RTB ~ DV, family = binomial, data = CPI_3_3)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.92221 0.02400 -38.422 <2e-16 ***
## DV 0.32320 0.03572 9.049 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 18390 on 14773 degrees of freedom
## Residual deviance: 18308 on 14772 degrees of freedom
## AIC: 18312
##
## Number of Fisher Scoring iterations: 4
# Cases in which family violence is indicated are more likely to result in an
# RTB disposition, and this association is statistically significant.