library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(readr)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
# Load the CPI 3.3 extract (path is relative to the working directory).
CPI_3_3 <- readr::read_csv(file = "CPI 3.3.csv")
## Rows: 14774 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): County, Region, Disposition, Family Violence Indicated
## dbl (1): Fiscal Year
## num (1): Completed Investigations
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(dplyr)

# Recode the analysis variables in one pass:
#   * Disposition, Family Violence Indicated and Region become factors;
#   * Completed Investigations is coerced to numeric;
#   * DV  is 1 when Family Violence Indicated == "Y", 0 for any other value;
#   * RTB is 1 when Disposition == "Reason to Believe", 0 for any other value.
# Missing values in the source columns stay NA in DV / RTB.
CPI_3_3 <- dplyr::mutate(
  CPI_3_3,
  Disposition = as.factor(Disposition),
  `Family Violence Indicated` = as.factor(`Family Violence Indicated`),
  `Completed Investigations` = as.numeric(`Completed Investigations`),
  DV = ifelse(`Family Violence Indicated` == "Y", 1, 0),
  RTB = ifelse(Disposition == "Reason to Believe", 1, 0),
  Region = as.factor(Region)
)
# Logistic regression: log-odds of an RTB disposition as a function of the
# family-violence indicator (DV) and Region.
# NOTE(review): every row contributes equally to this fit. If each row is an
# aggregated cell whose size is `Completed Investigations`, a `weights =`
# argument is probably needed -- confirm against the data dictionary.
log_model <- glm(
  formula = RTB ~ DV + Region,
  family = binomial,
  data = CPI_3_3
)

summary(log_model)
## 
## Call:
## glm(formula = RTB ~ DV + Region, family = binomial, data = CPI_3_3)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -0.817579   0.049554 -16.499  < 2e-16 ***
## DV                   0.335274   0.035838   9.355  < 2e-16 ***
## Region10-El Paso     0.123674   0.133631   0.925 0.354712    
## Region11-Edinburg   -0.179309   0.080279  -2.234 0.025511 *  
## Region2-Abilene      0.006429   0.071531   0.090 0.928389    
## Region3-Arlington   -0.325618   0.076753  -4.242 2.21e-05 ***
## Region4-Tyler       -0.179772   0.073616  -2.442 0.014606 *  
## Region5-Beaumont    -0.145491   0.084523  -1.721 0.085192 .  
## Region6-Houston     -0.294759   0.087479  -3.369 0.000753 ***
## Region7-Austin      -0.095785   0.068865  -1.391 0.164249    
## Region8-San Antonio -0.100099   0.070717  -1.415 0.156927    
## Region9-Midland      0.001482   0.074164   0.020 0.984055    
## RegionOut of State  -0.248763   0.308265  -0.807 0.419680    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 18390  on 14773  degrees of freedom
## Residual deviance: 18268  on 14761  degrees of freedom
## AIC: 18294
## 
## Number of Fisher Scoring iterations: 4
# Exponentiate the log-odds coefficients to read them as odds ratios.
odds_ratios <- exp(coef(log_model))
odds_ratios
##         (Intercept)                  DV    Region10-El Paso   Region11-Edinburg 
##           0.4414990           1.3983238           1.1316473           0.8358480 
##     Region2-Abilene   Region3-Arlington       Region4-Tyler    Region5-Beaumont 
##           1.0064493           0.7220806           0.8354611           0.8645978 
##     Region6-Houston      Region7-Austin Region8-San Antonio     Region9-Midland 
##           0.7447113           0.9086590           0.9047481           1.0014833 
##  RegionOut of State 
##           0.7797648
# Cross-tabulate DV (rows) against RTB (columns) and convert each row to
# proportions, i.e. the share of RTB = 0 / 1 within each DV level.
prop.table(table(CPI_3_3[["DV"]], CPI_3_3[["RTB"]]), margin = 1)
##    
##             0         1
##   0 0.7154920 0.2845080
##   1 0.6454298 0.3545702
# Pearson chi-squared test of independence between DV and RTB. The table is
# 2 x 2, so Yates' continuity correction applies (`correct = TRUE` is the
# default and is spelled out here only for clarity).
chisq.test(x = table(CPI_3_3$DV, CPI_3_3$RTB), correct = TRUE)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(CPI_3_3$DV, CPI_3_3$RTB)
## X-squared = 81.821, df = 1, p-value < 2.2e-16
library(ggplot2)

# Figure 1: RTB rate by family-violence flag, weighted by the number of
# completed investigations behind each row.
# stat_summary() does not use the `weight` aesthetic, so passing it through
# aes() would silently produce plain row means. The weighted rate is therefore
# computed explicitly and drawn with geom_col().
# Rows with a missing RTB or a missing weight are dropped first, because
# weighted.mean() returns NA if any weight is NA.
fig1_rates <- dplyr::summarise(
  dplyr::filter(CPI_3_3, !is.na(RTB), !is.na(`Completed Investigations`)),
  RTB = weighted.mean(RTB, w = `Completed Investigations`),
  .by = DV
)

ggplot(fig1_rates, aes(x = factor(DV), y = RTB)) +
  geom_col() +
  labs(x = "Family Violence (0 = No, 1 = Yes)",
       y = "Proportion of RTB",
       title = "Figure 1: RTB Rates by Family Violence")

# Figure 2: RTB rate by Region, split by family-violence flag and weighted by
# the number of completed investigations behind each row.
# stat_summary() does not use the `weight` aesthetic, so the weighted rate is
# computed per Region x DV cell up front and drawn with geom_col().
# Rows with a missing RTB or a missing weight are dropped first, because
# weighted.mean() returns NA if any weight is NA.
# The summary column keeps the name RTB so the default y-axis label is
# unchanged.
fig2_rates <- dplyr::summarise(
  dplyr::filter(CPI_3_3, !is.na(RTB), !is.na(`Completed Investigations`)),
  RTB = weighted.mean(RTB, w = `Completed Investigations`),
  .by = c(Region, DV)
)

ggplot(fig2_rates, aes(x = Region, y = RTB, fill = factor(DV))) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(fill = "DV",
       title = "Figure 2: RTB by Region and Family Violence")