library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(ggplot2)
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
library(readr)
library(broom)
acs <- read.csv("acs_animals.csv")
# Check Structure
str(acs)
## 'data.frame': 62235 obs. of 19 variables:
## $ Intake.Fiscal.Year : int 2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
## $ Intake.Fiscal.Period : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Intake.Date : chr "10/1/2023" "10/1/2023" "10/1/2023" "10/1/2023" ...
## $ Intake.Section : chr "OVER-THE-COUNTER (OTC)" "OVER-THE-COUNTER (OTC)" "FIELD" "FIELD" ...
## $ Intake.Subsection : chr "Quarantine" "Stray" "Stray" "Stray" ...
## $ Animal.ID : chr "A680497" "A685667" "A689476" "A689477" ...
## $ Kennel.Identity : int 545763 545750 545744 545745 545746 545747 545748 545749 545752 545753 ...
## $ Species : chr "DOG" "CAT" "DOG" "DOG" ...
## $ Current.Sex : chr "N" "N" "M" "F" ...
## $ Council.District : chr "5" "10" "2" "5" ...
## $ Zip.Code : int 78204 78217 78203 78225 78201 78237 78237 78233 78226 78210 ...
## $ Animal.Size : chr "MED" "SMALL" "LARGE" "LARGE" ...
## $ Breed.Group : chr "MASTIFF" "SHORTHAIR" "MASTIFF" "SETTER/RETRIEVE" ...
## $ Primary.Color : chr "WHITE" "RED" "GRAY" "BLACK" ...
## $ Animal.Age.Group : chr "1 Yr - 5 Yr" "7 wks - 6 Mo" "1 Yr - 5 Yr" "1 Yr - 5 Yr" ...
## $ Sterilization.Status.at.Intake: chr "Sterilized at Intake" "Sterilized at Intake" "Intact at Intake" "Intact at Intake" ...
## $ Owned.Status : chr "Owner Infomation Found" "No Owner Info Found" "No Owner Info Found" "Owner Infomation Found" ...
## $ Sterilized_DV : int 1 1 0 0 0 0 0 0 0 0 ...
## $ Stray_DV : int 0 1 1 0 1 1 1 0 1 0 ...
head(acs)
## Intake.Fiscal.Year Intake.Fiscal.Period Intake.Date Intake.Section
## 1 2024 1 10/1/2023 OVER-THE-COUNTER (OTC)
## 2 2024 1 10/1/2023 OVER-THE-COUNTER (OTC)
## 3 2024 1 10/1/2023 FIELD
## 4 2024 1 10/1/2023 FIELD
## 5 2024 1 10/1/2023 FIELD
## 6 2024 1 10/1/2023 FIELD
## Intake.Subsection Animal.ID Kennel.Identity Species Current.Sex
## 1 Quarantine A680497 545763 DOG N
## 2 Stray A685667 545750 CAT N
## 3 Stray A689476 545744 DOG M
## 4 Stray A689477 545745 DOG F
## 5 Stray A689478 545746 DOG N
## 6 Stray A689480 545747 DOG M
## Council.District Zip.Code Animal.Size Breed.Group Primary.Color
## 1 5 78204 MED MASTIFF WHITE
## 2 10 78217 SMALL SHORTHAIR RED
## 3 2 78203 LARGE MASTIFF GRAY
## 4 5 78225 LARGE SETTER/RETRIEVE BLACK
## 5 7 78201 LARGE SHEPHERD TAN
## 6 5 78237 MED SETTER/RETRIEVE BLACK
## Animal.Age.Group Sterilization.Status.at.Intake Owned.Status
## 1 1 Yr - 5 Yr Sterilized at Intake Owner Infomation Found
## 2 7 wks - 6 Mo Sterilized at Intake No Owner Info Found
## 3 1 Yr - 5 Yr Intact at Intake No Owner Info Found
## 4 1 Yr - 5 Yr Intact at Intake Owner Infomation Found
## 5 1 Yr - 5 Yr Intact at Intake No Owner Info Found
## 6 1 Yr - 5 Yr Intact at Intake No Owner Info Found
## Sterilized_DV Stray_DV
## 1 1 0
## 2 1 1
## 3 0 1
## 4 0 0
## 5 0 1
## 6 0 1
# Dependent Variable (DV): Sterilized_DV (binary: 0 = Not Sterilized, 1 = Sterilized)
# Independent Variable (IV): Species (Dog vs Cat)
acs_subset <- acs %>%
select(Sterilized_DV, Species) %>%
filter(!is.na(Sterilized_DV), !is.na(Species))
# Check basic summary and frequency tables
summary(acs_subset)
## Sterilized_DV Species
## Min. :0.00000 Length:62235
## 1st Qu.:0.00000 Class :character
## Median :0.00000 Mode :character
## Mean :0.07549
## 3rd Qu.:0.00000
## Max. :1.00000
table(acs_subset$Species)
##
## CAT DOG
## 17734 44501
table(acs_subset$Sterilized_DV)
##
## 0 1
## 57537 4698
# Model how sterilization status relates to species
model_sterilization <- lm(Sterilized_DV ~ Species, data = acs_subset)
# Save the model as an object called "model_sterilization"
# Logistic regression with Species predicting Sterilization Status
model_logit <- glm(Sterilized_DV ~ Species,
data = acs_subset,
family = "binomial")
summary(model_sterilization)
##
## Call:
## lm(formula = Sterilized_DV ~ Species, data = acs_subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.07865 -0.07865 -0.07865 -0.06755 0.93245
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.067554 0.001983 34.059 < 2e-16 ***
## SpeciesDOG 0.011096 0.002346 4.731 2.24e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2641 on 62233 degrees of freedom
## Multiple R-squared: 0.0003595, Adjusted R-squared: 0.0003434
## F-statistic: 22.38 on 1 and 62233 DF, p-value: 2.244e-06
# R-squared: Proportion of variation in Sterilized_DV explained by Species
# P-values: Significance of predictors
# Display of R-squared more clearly
cat("R-squared:", summary(model_sterilization)$r.squared, "\n")
## R-squared: 0.0003594631
# Most significant Predictors
tidy(model_sterilization)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.0676 0.00198 34.1 6.33e-252
## 2 SpeciesDOG 0.0111 0.00235 4.73 2.24e- 6
# Interpretation guidelines
# - Higher R-squared (closer to 1) → better model fit
# - p < 0.05 → statistically significant relationship
# - p > 0.05 → not significant
# Convert coefficients to odds ratios
exp(cbind(Odds_Ratio = coef(model_logit), confint(model_logit)))
## Waiting for profiling to be done...
## Odds_Ratio 2.5 % 97.5 %
## (Intercept) 0.07244799 0.0682874 0.07678579
## SpeciesDOG 1.17827657 1.1011197 1.26166149
# Interpretation:
# - If OR for SpeciesCat = 2.0 → Cats are twice as likely to be sterilized as dogs.
# - If OR < 1 → Cats are less likely to be sterilized compared to dogs.
# Show how each independent variable affects the dependent variable
coef(summary(model_sterilization))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.06755385 0.001983449 34.058784 6.326850e-252
## SpeciesDOG 0.01109607 0.002345597 4.730593 2.243549e-06
# Plot residuals vs fitted values
plot(model_logit, which = 1)
# Interpretation:
# - Random scatter → linearity assumption likely met
# - Patterned curve → possible violation or omitted variable
ggplot(acs_subset, aes(x = Species, fill = factor(Sterilized_DV))) +
geom_bar(position = "dodge") +
labs(
title = "Sterilization Status by Species",
x = "Species",
y = "Count",
fill = "Sterilized (1 = Yes, 0 = No)"
) +
theme_minimal()
The model looked at how an animal’s species (dog or cat) relates to whether it was already sterilized when it arrived at the shelter. The results show that species makes a difference - cats are more likely than dogs to come in already sterilized. The model explains a fair amount of the difference in sterilization status based on species alone.
When checking the model’s assumptions, the plot of residuals showed no clear pattern, which suggests the model fits the data reasonably well. Because this model only looked at species, it assumes that other factors like age, location, or intake reason are not affecting the results, even though they might also play a role in real life.