library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(ggplot2)
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
library(readr)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(broom)
acs <- read.csv("acs_animals.csv")
# Check Structure
str(acs)
## 'data.frame': 62235 obs. of 19 variables:
## $ Intake.Fiscal.Year : int 2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
## $ Intake.Fiscal.Period : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Intake.Date : chr "10/1/2023" "10/1/2023" "10/1/2023" "10/1/2023" ...
## $ Intake.Section : chr "OVER-THE-COUNTER (OTC)" "OVER-THE-COUNTER (OTC)" "FIELD" "FIELD" ...
## $ Intake.Subsection : chr "Quarantine" "Stray" "Stray" "Stray" ...
## $ Animal.ID : chr "A680497" "A685667" "A689476" "A689477" ...
## $ Kennel.Identity : int 545763 545750 545744 545745 545746 545747 545748 545749 545752 545753 ...
## $ Species : chr "DOG" "CAT" "DOG" "DOG" ...
## $ Current.Sex : chr "N" "N" "M" "F" ...
## $ Council.District : chr "5" "10" "2" "5" ...
## $ Zip.Code : int 78204 78217 78203 78225 78201 78237 78237 78233 78226 78210 ...
## $ Animal.Size : chr "MED" "SMALL" "LARGE" "LARGE" ...
## $ Breed.Group : chr "MASTIFF" "SHORTHAIR" "MASTIFF" "SETTER/RETRIEVE" ...
## $ Primary.Color : chr "WHITE" "RED" "GRAY" "BLACK" ...
## $ Animal.Age.Group : chr "1 Yr - 5 Yr" "7 wks - 6 Mo" "1 Yr - 5 Yr" "1 Yr - 5 Yr" ...
## $ Sterilization.Status.at.Intake: chr "Sterilized at Intake" "Sterilized at Intake" "Intact at Intake" "Intact at Intake" ...
## $ Owned.Status : chr "Owner Infomation Found" "No Owner Info Found" "No Owner Info Found" "Owner Infomation Found" ...
## $ Sterilized_DV : int 1 1 0 0 0 0 0 0 0 0 ...
## $ Stray_DV : int 0 1 1 0 1 1 1 0 1 0 ...
head(acs)
## Intake.Fiscal.Year Intake.Fiscal.Period Intake.Date Intake.Section
## 1 2024 1 10/1/2023 OVER-THE-COUNTER (OTC)
## 2 2024 1 10/1/2023 OVER-THE-COUNTER (OTC)
## 3 2024 1 10/1/2023 FIELD
## 4 2024 1 10/1/2023 FIELD
## 5 2024 1 10/1/2023 FIELD
## 6 2024 1 10/1/2023 FIELD
## Intake.Subsection Animal.ID Kennel.Identity Species Current.Sex
## 1 Quarantine A680497 545763 DOG N
## 2 Stray A685667 545750 CAT N
## 3 Stray A689476 545744 DOG M
## 4 Stray A689477 545745 DOG F
## 5 Stray A689478 545746 DOG N
## 6 Stray A689480 545747 DOG M
## Council.District Zip.Code Animal.Size Breed.Group Primary.Color
## 1 5 78204 MED MASTIFF WHITE
## 2 10 78217 SMALL SHORTHAIR RED
## 3 2 78203 LARGE MASTIFF GRAY
## 4 5 78225 LARGE SETTER/RETRIEVE BLACK
## 5 7 78201 LARGE SHEPHERD TAN
## 6 5 78237 MED SETTER/RETRIEVE BLACK
## Animal.Age.Group Sterilization.Status.at.Intake Owned.Status
## 1 1 Yr - 5 Yr Sterilized at Intake Owner Infomation Found
## 2 7 wks - 6 Mo Sterilized at Intake No Owner Info Found
## 3 1 Yr - 5 Yr Intact at Intake No Owner Info Found
## 4 1 Yr - 5 Yr Intact at Intake Owner Infomation Found
## 5 1 Yr - 5 Yr Intact at Intake No Owner Info Found
## 6 1 Yr - 5 Yr Intact at Intake No Owner Info Found
## Sterilized_DV Stray_DV
## 1 1 0
## 2 1 1
## 3 0 1
## 4 0 0
## 5 0 1
## 6 0 1
glimpse("acs_animals.csv")
## chr "acs_animals.csv"
acs_modeldata <- acs %>%
select(Species, Sterilized_DV, Stray_DV, Zip.Code) %>%
filter(!is.na(Sterilized_DV),
!is.na(Species),
!is.na(Stray_DV),
!is.na(Zip.Code))
# Model how sterilization status relates to species
model <- lm(Sterilized_DV ~ Species + Stray_DV + Zip.Code, data = acs_modeldata)
summary(model)
##
## Call:
## lm(formula = Sterilized_DV ~ Species + Stray_DV + Zip.Code, data = acs_modeldata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26712 -0.15264 -0.04441 -0.01113 0.98986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.310e-01 8.046e-02 4.114 3.89e-05 ***
## SpeciesDOG -3.330e-02 2.374e-03 -14.028 < 2e-16 ***
## Stray_DV -1.415e-01 2.200e-03 -64.346 < 2e-16 ***
## Zip.Code -1.855e-06 1.028e-06 -1.803 0.0714 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2558 on 62231 degrees of freedom
## Multiple R-squared: 0.06277, Adjusted R-squared: 0.06273
## F-statistic: 1389 on 3 and 62231 DF, p-value: < 2.2e-16
Linearity
plot(model, which = 1) # Residuals vs Fitted Plot
raintest(model) # Rainbow test for linearity
##
## Rainbow test
##
## data: model
## Rain = 0.98203, df1 = 31118, df2 = 31113, p-value = 0.9451
Independence of Errors
dwtest(model) # Durbin-Watson test
##
## Durbin-Watson test
##
## data: model
## DW = 1.511, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
Homoscedasticity
plot(model, which = 3) # Scale-Location Plot
bptest(model) # Breusch-Pagan Test
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 4716.3, df = 3, p-value < 2.2e-16
Normality of Residuals
plot(model, which = 2) # QQ Plot
No Multicollinearity
vif(model) # Variance Inflation Factor
## Species Stray_DV Zip.Code
## 1.092391 1.092368 1.000026
cor(acs_modeldata %>% select(Sterilized_DV, Stray_DV, Zip.Code))
## Sterilized_DV Stray_DV Zip.Code
## Sterilized_DV 1.000000000 -0.2444498826 -0.0074377490
## Stray_DV -0.244449883 1.0000000000 0.0006778959
## Zip.Code -0.007437749 0.0006778959 1.0000000000
acs_modeldata <- acs_modeldata %>%
mutate(log_zip = log(Zip.Code))
model2 <- lm(Sterilized_DV ~ Species + Stray_DV + log_zip,
data = acs_modeldata)
summary(model2)
##
## Call:
## lm(formula = Sterilized_DV ~ Species + Stray_DV + log_zip, data = acs_modeldata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.34028 -0.15262 -0.04439 -0.01112 0.98970
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.544606 0.470133 3.285 0.00102 **
## SpeciesDOG -0.033293 0.002374 -14.026 < 2e-16 ***
## Stray_DV -0.141515 0.002199 -64.342 < 2e-16 ***
## log_zip -0.120586 0.041727 -2.890 0.00386 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2557 on 62231 degrees of freedom
## Multiple R-squared: 0.06285, Adjusted R-squared: 0.0628
## F-statistic: 1391 on 3 and 62231 DF, p-value: < 2.2e-16
# Check again
plot(model2, which = 1)
vif(model2)
## Species Stray_DV log_zip
## 1.092382 1.092377 1.000024
The model looked at whether species (dog vs cat), stray intake status, and zip code predict whether an animal arrives already sterilized. Species and stray status appear to influence sterilization outcomes. The assumption checks suggest that some linear model assumptions are not perfectly met, especially normality and equal spread of residuals, which is expected because sterilization is a binary variable. To improve the model, a log transformation was applied to the zip code variable. A logistic regression would be a more appropriate method for this type of outcome in a real-world analysis.