library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(ggplot2)
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(readr)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(broom)

Data Set

acs <- read.csv("acs_animals.csv")

# Check Structure
str(acs)
## 'data.frame':    62235 obs. of  19 variables:
##  $ Intake.Fiscal.Year            : int  2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
##  $ Intake.Fiscal.Period          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Intake.Date                   : chr  "10/1/2023" "10/1/2023" "10/1/2023" "10/1/2023" ...
##  $ Intake.Section                : chr  "OVER-THE-COUNTER (OTC)" "OVER-THE-COUNTER (OTC)" "FIELD" "FIELD" ...
##  $ Intake.Subsection             : chr  "Quarantine" "Stray" "Stray" "Stray" ...
##  $ Animal.ID                     : chr  "A680497" "A685667" "A689476" "A689477" ...
##  $ Kennel.Identity               : int  545763 545750 545744 545745 545746 545747 545748 545749 545752 545753 ...
##  $ Species                       : chr  "DOG" "CAT" "DOG" "DOG" ...
##  $ Current.Sex                   : chr  "N" "N" "M" "F" ...
##  $ Council.District              : chr  "5" "10" "2" "5" ...
##  $ Zip.Code                      : int  78204 78217 78203 78225 78201 78237 78237 78233 78226 78210 ...
##  $ Animal.Size                   : chr  "MED" "SMALL" "LARGE" "LARGE" ...
##  $ Breed.Group                   : chr  "MASTIFF" "SHORTHAIR" "MASTIFF" "SETTER/RETRIEVE" ...
##  $ Primary.Color                 : chr  "WHITE" "RED" "GRAY" "BLACK" ...
##  $ Animal.Age.Group              : chr  "1 Yr - 5 Yr" "7 wks - 6 Mo" "1 Yr - 5 Yr" "1 Yr - 5 Yr" ...
##  $ Sterilization.Status.at.Intake: chr  "Sterilized at Intake" "Sterilized at Intake" "Intact at Intake" "Intact at Intake" ...
##  $ Owned.Status                  : chr  "Owner Infomation Found" "No Owner Info Found" "No Owner Info Found" "Owner Infomation Found" ...
##  $ Sterilized_DV                 : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ Stray_DV                      : int  0 1 1 0 1 1 1 0 1 0 ...
head(acs)
##   Intake.Fiscal.Year Intake.Fiscal.Period Intake.Date         Intake.Section
## 1               2024                    1   10/1/2023 OVER-THE-COUNTER (OTC)
## 2               2024                    1   10/1/2023 OVER-THE-COUNTER (OTC)
## 3               2024                    1   10/1/2023                  FIELD
## 4               2024                    1   10/1/2023                  FIELD
## 5               2024                    1   10/1/2023                  FIELD
## 6               2024                    1   10/1/2023                  FIELD
##   Intake.Subsection Animal.ID Kennel.Identity Species Current.Sex
## 1        Quarantine   A680497          545763     DOG           N
## 2             Stray   A685667          545750     CAT           N
## 3             Stray   A689476          545744     DOG           M
## 4             Stray   A689477          545745     DOG           F
## 5             Stray   A689478          545746     DOG           N
## 6             Stray   A689480          545747     DOG           M
##   Council.District Zip.Code Animal.Size     Breed.Group Primary.Color
## 1                5    78204         MED         MASTIFF         WHITE
## 2               10    78217       SMALL       SHORTHAIR           RED
## 3                2    78203       LARGE         MASTIFF          GRAY
## 4                5    78225       LARGE SETTER/RETRIEVE         BLACK
## 5                7    78201       LARGE        SHEPHERD           TAN
## 6                5    78237         MED SETTER/RETRIEVE         BLACK
##   Animal.Age.Group Sterilization.Status.at.Intake           Owned.Status
## 1      1 Yr - 5 Yr           Sterilized at Intake Owner Infomation Found
## 2     7 wks - 6 Mo           Sterilized at Intake    No Owner Info Found
## 3      1 Yr - 5 Yr               Intact at Intake    No Owner Info Found
## 4      1 Yr - 5 Yr               Intact at Intake Owner Infomation Found
## 5      1 Yr - 5 Yr               Intact at Intake    No Owner Info Found
## 6      1 Yr - 5 Yr               Intact at Intake    No Owner Info Found
##   Sterilized_DV Stray_DV
## 1             1        0
## 2             1        1
## 3             0        1
## 4             0        0
## 5             0        1
## 6             0        1
glimpse("acs_animals.csv")
##  chr "acs_animals.csv"

Variables of Interest

acs_modeldata <- acs %>%
  select(Species, Sterilized_DV, Stray_DV, Zip.Code) %>%
  filter(!is.na(Sterilized_DV),
         !is.na(Species),
         !is.na(Stray_DV),
         !is.na(Zip.Code))

Linear Model

# Model how sterilization status relates to species

model <- lm(Sterilized_DV ~ Species + Stray_DV + Zip.Code, data = acs_modeldata)

Summary of Model

summary(model)
## 
## Call:
## lm(formula = Sterilized_DV ~ Species + Stray_DV + Zip.Code, data = acs_modeldata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.26712 -0.15264 -0.04441 -0.01113  0.98986 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.310e-01  8.046e-02   4.114 3.89e-05 ***
## SpeciesDOG  -3.330e-02  2.374e-03 -14.028  < 2e-16 ***
## Stray_DV    -1.415e-01  2.200e-03 -64.346  < 2e-16 ***
## Zip.Code    -1.855e-06  1.028e-06  -1.803   0.0714 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2558 on 62231 degrees of freedom
## Multiple R-squared:  0.06277,    Adjusted R-squared:  0.06273 
## F-statistic:  1389 on 3 and 62231 DF,  p-value: < 2.2e-16

Assumption Checks

Linearity

plot(model, which = 1)   # Residuals vs Fitted Plot

raintest(model)          # Rainbow test for linearity
## 
##  Rainbow test
## 
## data:  model
## Rain = 0.98203, df1 = 31118, df2 = 31113, p-value = 0.9451

Independence of Errors

dwtest(model)            # Durbin-Watson test
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 1.511, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

Homoscedasticity

plot(model, which = 3)   # Scale-Location Plot

bptest(model)            # Breusch-Pagan Test
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 4716.3, df = 3, p-value < 2.2e-16

Normality of Residuals

plot(model, which = 2)   # QQ Plot

No Multicollinearity

vif(model)               # Variance Inflation Factor
##  Species Stray_DV Zip.Code 
## 1.092391 1.092368 1.000026
cor(acs_modeldata %>% select(Sterilized_DV, Stray_DV, Zip.Code))
##               Sterilized_DV      Stray_DV      Zip.Code
## Sterilized_DV   1.000000000 -0.2444498826 -0.0074377490
## Stray_DV       -0.244449883  1.0000000000  0.0006778959
## Zip.Code       -0.007437749  0.0006778959  1.0000000000
acs_modeldata <- acs_modeldata %>%
mutate(log_zip = log(Zip.Code))

model2 <- lm(Sterilized_DV ~ Species + Stray_DV + log_zip,
data = acs_modeldata)

summary(model2)
## 
## Call:
## lm(formula = Sterilized_DV ~ Species + Stray_DV + log_zip, data = acs_modeldata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34028 -0.15262 -0.04439 -0.01112  0.98970 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.544606   0.470133   3.285  0.00102 ** 
## SpeciesDOG  -0.033293   0.002374 -14.026  < 2e-16 ***
## Stray_DV    -0.141515   0.002199 -64.342  < 2e-16 ***
## log_zip     -0.120586   0.041727  -2.890  0.00386 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2557 on 62231 degrees of freedom
## Multiple R-squared:  0.06285,    Adjusted R-squared:  0.0628 
## F-statistic:  1391 on 3 and 62231 DF,  p-value: < 2.2e-16
# Check again

plot(model2, which = 1)

vif(model2)
##  Species Stray_DV  log_zip 
## 1.092382 1.092377 1.000024

Findings and Assumptions

The model looked at whether species (dog vs cat), stray intake status, and zip code predict whether an animal arrives already sterilized. Species and stray status appear to influence sterilization outcomes. The assumption checks suggest that some linear model assumptions are not perfectly met, especially normality and equal spread of residuals, which is expected because sterilization is a binary variable. To improve the model, a log transformation was applied to the zip code variable. A logistic regression would be a more appropriate method for this type of outcome in a real-world analysis.