library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(readr)
# Load the CPI 3.3 extract from the working directory. Column types are
# guessed by readr; the specification it settled on is echoed in the
# message that follows this call.
CPI_3_3 <- read_csv(file = "CPI 3.3.csv")
## Rows: 14774 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): County, Region, Disposition, Family Violence Indicated
## dbl (1): Fiscal Year
## num (1): Completed Investigations
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Store the two categorical fields as factors and coerce the investigation
# count to a plain numeric vector, all in a single pass over the table.
CPI_3_3 <- CPI_3_3 |>
  mutate(
    across(c(Disposition, `Family Violence Indicated`), as.factor),
    `Completed Investigations` = as.numeric(`Completed Investigations`)
  )
library(dplyr)
# Build the 0/1 indicator columns used by the regression.
#   RTB: 1 when the disposition is "Reason to Believe", otherwise 0.
#   DV:  1 when family violence is indicated ("Y"), otherwise 0.
#        DV is the predictor in the model, not the dependent variable.
# Rows where the source value is NA stay NA in the indicator.
CPI_3_3 <- CPI_3_3 |>
  mutate(
    RTB = as.numeric(Disposition == "Reason to Believe"),
    DV = as.numeric(`Family Violence Indicated` == "Y")
  )
# Linear probability model: regress the Reason-to-Believe indicator on the
# family-violence indicator, weighting each row by its number of completed
# investigations.
# NOTE(review): lm() applies these as least-squares weights, so the residual
# degrees of freedom count rows rather than investigations -- confirm that
# this is the intended basis for inference.
model <- lm(
  formula = RTB ~ DV,
  data = CPI_3_3,
  weights = `Completed Investigations`
)
summary(model)
## 
## Call:
## lm(formula = RTB ~ DV, data = CPI_3_3, weights = `Completed Investigations`)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.182  -1.360  -0.555   1.090  51.372 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.198090   0.003619   54.74   <2e-16 ***
## DV          0.356993   0.010325   34.58   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.285 on 14772 degrees of freedom
## Multiple R-squared:  0.07487,    Adjusted R-squared:  0.07481 
## F-statistic:  1196 on 1 and 14772 DF,  p-value: < 2.2e-16
# The family violence variable "DV" is statistically significant, based on
# the very small p-value (< 2.2e-16). Its coefficient is positive (0.357), so
# investigations in which family violence is indicated are more likely to end
# in a Reason to Believe disposition.
# The model explains roughly 7.5% of the variation in Reason to Believe
# dispositions (R-squared = 0.0749). Other factors could include substance
# use, mental health, family history with CPS, region, and caseworker bias.
# Draw only the first lm diagnostic panel (residuals against fitted values).
plot(x = model, which = 1L)

# Based on this plot, I would say the relationship is non-linear: the data
# do not seem to be fit well by a linear model.