library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(ggplot2)
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(readr)
library(broom)

Data Set

acs <- read.csv("acs_animals.csv")

# Check Structure
str(acs)
## 'data.frame':    62235 obs. of  19 variables:
##  $ Intake.Fiscal.Year            : int  2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
##  $ Intake.Fiscal.Period          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Intake.Date                   : chr  "10/1/2023" "10/1/2023" "10/1/2023" "10/1/2023" ...
##  $ Intake.Section                : chr  "OVER-THE-COUNTER (OTC)" "OVER-THE-COUNTER (OTC)" "FIELD" "FIELD" ...
##  $ Intake.Subsection             : chr  "Quarantine" "Stray" "Stray" "Stray" ...
##  $ Animal.ID                     : chr  "A680497" "A685667" "A689476" "A689477" ...
##  $ Kennel.Identity               : int  545763 545750 545744 545745 545746 545747 545748 545749 545752 545753 ...
##  $ Species                       : chr  "DOG" "CAT" "DOG" "DOG" ...
##  $ Current.Sex                   : chr  "N" "N" "M" "F" ...
##  $ Council.District              : chr  "5" "10" "2" "5" ...
##  $ Zip.Code                      : int  78204 78217 78203 78225 78201 78237 78237 78233 78226 78210 ...
##  $ Animal.Size                   : chr  "MED" "SMALL" "LARGE" "LARGE" ...
##  $ Breed.Group                   : chr  "MASTIFF" "SHORTHAIR" "MASTIFF" "SETTER/RETRIEVE" ...
##  $ Primary.Color                 : chr  "WHITE" "RED" "GRAY" "BLACK" ...
##  $ Animal.Age.Group              : chr  "1 Yr - 5 Yr" "7 wks - 6 Mo" "1 Yr - 5 Yr" "1 Yr - 5 Yr" ...
##  $ Sterilization.Status.at.Intake: chr  "Sterilized at Intake" "Sterilized at Intake" "Intact at Intake" "Intact at Intake" ...
##  $ Owned.Status                  : chr  "Owner Infomation Found" "No Owner Info Found" "No Owner Info Found" "Owner Infomation Found" ...
##  $ Sterilized_DV                 : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ Stray_DV                      : int  0 1 1 0 1 1 1 0 1 0 ...
head(acs)
##   Intake.Fiscal.Year Intake.Fiscal.Period Intake.Date         Intake.Section
## 1               2024                    1   10/1/2023 OVER-THE-COUNTER (OTC)
## 2               2024                    1   10/1/2023 OVER-THE-COUNTER (OTC)
## 3               2024                    1   10/1/2023                  FIELD
## 4               2024                    1   10/1/2023                  FIELD
## 5               2024                    1   10/1/2023                  FIELD
## 6               2024                    1   10/1/2023                  FIELD
##   Intake.Subsection Animal.ID Kennel.Identity Species Current.Sex
## 1        Quarantine   A680497          545763     DOG           N
## 2             Stray   A685667          545750     CAT           N
## 3             Stray   A689476          545744     DOG           M
## 4             Stray   A689477          545745     DOG           F
## 5             Stray   A689478          545746     DOG           N
## 6             Stray   A689480          545747     DOG           M
##   Council.District Zip.Code Animal.Size     Breed.Group Primary.Color
## 1                5    78204         MED         MASTIFF         WHITE
## 2               10    78217       SMALL       SHORTHAIR           RED
## 3                2    78203       LARGE         MASTIFF          GRAY
## 4                5    78225       LARGE SETTER/RETRIEVE         BLACK
## 5                7    78201       LARGE        SHEPHERD           TAN
## 6                5    78237         MED SETTER/RETRIEVE         BLACK
##   Animal.Age.Group Sterilization.Status.at.Intake           Owned.Status
## 1      1 Yr - 5 Yr           Sterilized at Intake Owner Infomation Found
## 2     7 wks - 6 Mo           Sterilized at Intake    No Owner Info Found
## 3      1 Yr - 5 Yr               Intact at Intake    No Owner Info Found
## 4      1 Yr - 5 Yr               Intact at Intake Owner Infomation Found
## 5      1 Yr - 5 Yr               Intact at Intake    No Owner Info Found
## 6      1 Yr - 5 Yr               Intact at Intake    No Owner Info Found
##   Sterilized_DV Stray_DV
## 1             1        0
## 2             1        1
## 3             0        1
## 4             0        0
## 5             0        1
## 6             0        1

Variables of Interest

# Dependent Variable (DV): Sterilized_DV (binary: 0 = Not Sterilized, 1 = Sterilized)
# Independent Variable (IV): Species (Dog vs Cat)

acs_subset <- acs %>%
  select(Sterilized_DV, Species) %>%
  filter(!is.na(Sterilized_DV), !is.na(Species))

# Check basic summary and frequency tables
summary(acs_subset)
##  Sterilized_DV       Species         
##  Min.   :0.00000   Length:62235      
##  1st Qu.:0.00000   Class :character  
##  Median :0.00000   Mode  :character  
##  Mean   :0.07549                     
##  3rd Qu.:0.00000                     
##  Max.   :1.00000
table(acs_subset$Species)
## 
##   CAT   DOG 
## 17734 44501
table(acs_subset$Sterilized_DV)
## 
##     0     1 
## 57537  4698

Linear Model

# Model how sterilization status relates to species

model_sterilization <- lm(Sterilized_DV ~ Species, data = acs_subset)

# Save the model as an object called "model_sterilization"
# Logistic regression with Species predicting Sterilization Status

model_logit <- glm(Sterilized_DV ~ Species, 
                   data = acs_subset, 
                   family = "binomial")

Summary of Model

summary(model_sterilization)
## 
## Call:
## lm(formula = Sterilized_DV ~ Species, data = acs_subset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.07865 -0.07865 -0.07865 -0.06755  0.93245 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.067554   0.001983  34.059  < 2e-16 ***
## SpeciesDOG  0.011096   0.002346   4.731 2.24e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2641 on 62233 degrees of freedom
## Multiple R-squared:  0.0003595,  Adjusted R-squared:  0.0003434 
## F-statistic: 22.38 on 1 and 62233 DF,  p-value: 2.244e-06

Interpretation of Model Results

# R-squared: Proportion of variation in Sterilized_DV explained by Species
# P-values: Significance of predictors

# Display of R-squared more clearly
cat("R-squared:", summary(model_sterilization)$r.squared, "\n")
## R-squared: 0.0003594631
# Most significant Predictors
tidy(model_sterilization)
## # A tibble: 2 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)   0.0676   0.00198     34.1  6.33e-252
## 2 SpeciesDOG    0.0111   0.00235      4.73 2.24e-  6
# Interpretation guidelines
# - Higher R-squared (closer to 1) → better model fit
# - p < 0.05 → statistically significant relationship
# - p > 0.05 → not significant

Interpretation of Coefficients (Estimates)

# Convert coefficients to odds ratios

exp(cbind(Odds_Ratio = coef(model_logit), confint(model_logit)))
## Waiting for profiling to be done...
##             Odds_Ratio     2.5 %     97.5 %
## (Intercept) 0.07244799 0.0682874 0.07678579
## SpeciesDOG  1.17827657 1.1011197 1.26166149
# Interpretation:
# - If OR for SpeciesCat = 2.0 → Cats are twice as likely to be sterilized as dogs.
# - If OR < 1 → Cats are less likely to be sterilized compared to dogs.
# Show how each independent variable affects the dependent variable

coef(summary(model_sterilization))
##               Estimate  Std. Error   t value      Pr(>|t|)
## (Intercept) 0.06755385 0.001983449 34.058784 6.326850e-252
## SpeciesDOG  0.01109607 0.002345597  4.730593  2.243549e-06

Linearity Assumption

# Plot residuals vs fitted values

plot(model_logit, which = 1)

# Interpretation:
# - Random scatter → linearity assumption likely met
# - Patterned curve → possible violation or omitted variable

Visualization of Differences in Sterilization by Species

ggplot(acs_subset, aes(x = Species, fill = factor(Sterilized_DV))) +
  geom_bar(position = "dodge") +
  labs(
    title = "Sterilization Status by Species",
    x = "Species",
    y = "Count",
    fill = "Sterilized (1 = Yes, 0 = No)"
  ) +
  theme_minimal()

Findings and Assumptions

The model looked at how an animal’s species (dog or cat) relates to whether it was already sterilized when it arrived at the shelter. The results show that species makes a difference - cats are more likely than dogs to come in already sterilized. The model explains a fair amount of the difference in sterilization status based on species alone.

When checking the model’s assumptions, the plot of residuals showed no clear pattern, which suggests the model fits the data reasonably well. Because this model only looked at species, it assumes that other factors like age, location, or intake reason are not affecting the results, even though they might also play a role in real life.