library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(readr)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
# Load the CPI 3.3 extract (path is relative to the working directory);
# readr infers the column types.
CPI_3_3 <- readr::read_csv(file = "CPI 3.3.csv")
## Rows: 14774 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): County, Region, Disposition, Family Violence Indicated
## dbl (1): Fiscal Year
## num (1): Completed Investigations
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Treat the two categorical columns as factors.
CPI_3_3[["Disposition"]] <- as.factor(CPI_3_3[["Disposition"]])
CPI_3_3[["Family Violence Indicated"]] <-
  as.factor(CPI_3_3[["Family Violence Indicated"]])

# Make sure the `Completed Investigations` column is a plain numeric vector.
CPI_3_3[["Completed Investigations"]] <-
  as.numeric(CPI_3_3[["Completed Investigations"]])
library(dplyr)

# 0/1 indicators used by the models below: RTB is 1 when the disposition is
# "Reason to Believe"; DV is 1 when family violence is flagged "Y".
CPI_3_3[["RTB"]] <- as.numeric(CPI_3_3[["Disposition"]] == "Reason to Believe")
CPI_3_3[["DV"]] <- as.numeric(CPI_3_3[["Family Violence Indicated"]] == "Y")

# Weighted linear probability model: regress the 0/1 RTB indicator on the 0/1
# DV indicator, weighting each row by its `Completed Investigations` value.
model <- lm(
  formula = RTB ~ DV,
  data = CPI_3_3,
  weights = `Completed Investigations`
)
summary(model)
## 
## Call:
## lm(formula = RTB ~ DV, data = CPI_3_3, weights = `Completed Investigations`)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.182  -1.360  -0.555   1.090  51.372 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.198090   0.003619   54.74   <2e-16 ***
## DV          0.356993   0.010325   34.58   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.285 on 14772 degrees of freedom
## Multiple R-squared:  0.07487,    Adjusted R-squared:  0.07481 
## F-statistic:  1196 on 1 and 14772 DF,  p-value: < 2.2e-16
# Linearity check: scatter the 0/1 outcome against the 0/1 predictor and
# overlay the fitted regression line.
plot(CPI_3_3$DV, CPI_3_3$RTB,
     xlab = "Family Violence (DV)",
     ylab = "RTB",
     main = "Linearity Check")

# Draw the line from the weighted `model` that the diagnostics in this script
# examine, rather than refitting a separate unweighted regression whose slope
# and intercept differ from the model being checked.
abline(model, col = "blue")

#The plot is difficult to read since my data is binary, but there is a positive relationship between DV and RTB.

# Rainbow test (lmtest) on the weighted linear model.
lmtest::raintest(model)
## 
##  Rainbow test
## 
## data:  model
## Rain = 0.99094, df1 = 7387, df2 = 7385, p-value = 0.6521
# The rainbow test is not significant (p = .652, which is greater than .05), so there is no evidence against the linearity assumption.
# Durbin-Watson test (car) for lag-1 autocorrelation in the residuals of
# `model`.
# NOTE(review): the statistic depends on the order of the rows; confirm that
# the row order of CPI_3_3 is meaningful before relying on this result.
car::durbinWatsonTest(model)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.1300966      1.739611       0
##  Alternative hypothesis: rho != 0
# With a p-value below 0.05 (D-W = 1.74, lag-1 autocorrelation = 0.13), the residuals appear to be positively autocorrelated, so the independence-of-errors assumption is questionable.
# Homoscedasticity check: residuals against fitted values, with a horizontal
# reference line at zero.
plot(
  x = fitted(model),
  y = residuals(model),
  xlab = "Fitted Values",
  ylab = "Residuals",
  main = "Homoscedasticity Check"
)
abline(h = 0, col = "green")

# Studentized Breusch-Pagan test on the same model.
lmtest::bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 1610172, df = 1, p-value < 2.2e-16
# The Breusch-Pagan test was significant (p < .001), so the assumption of homoscedasticity was violated.
# Normality check: normal Q-Q plot of the model residuals with a reference
# line through the quartiles.
qqnorm(residuals(model))
qqline(residuals(model), col = "orange")

#shapiro.test(model$residuals)

# The data set is too large for the Shapiro-Wilk test: shapiro.test() accepts at most 5,000 observations, and this data has 14,774 rows.
#
#cor(CPI_3_3[, c("DV", "County")])

#vif(model)

#Multicollinearity and correlation tests were not conducted because the model includes only one independent variable.

#4 Some of them, but not all. Linearity was fine, but the other ones had some issues. That’s honestly expected though because this kind of data (0/1 outcome) doesn’t work perfectly with linear regression.

#5 Homoscedasticity and normality were definitely violated, and there was also a small issue with independence of errors.

#6 I’d use logistic regression instead, since this outcome is binary. Linear regression isn’t really made for that, so switching models would fix most of these issues.
# Logistic regression of the 0/1 RTB indicator on DV (binomial family).
# NOTE(review): unlike the lm() fit earlier in this script, this call passes no
# `weights`, so every row counts once regardless of its
# `Completed Investigations` value; confirm that this is intended.
log_model <- glm(
  formula = RTB ~ DV,
  family = binomial,
  data = CPI_3_3
)
summary(log_model)
## 
## Call:
## glm(formula = RTB ~ DV, family = binomial, data = CPI_3_3)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.92221    0.02400 -38.422   <2e-16 ***
## DV           0.32320    0.03572   9.049   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 18390  on 14773  degrees of freedom
## Residual deviance: 18308  on 14772  degrees of freedom
## AIC: 18312
## 
## Number of Fisher Scoring iterations: 4

# Family violence makes it more likely that a case will result in RTB (log-odds coefficient = 0.32, odds ratio of about 1.38), and this relationship is statistically significant (p < .001).