library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(ggplot2)
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

Data Set

animals <- read.csv("acs_animals.csv")

# Check structure
str(animals)
## 'data.frame':    62235 obs. of  19 variables:
##  $ Intake.Fiscal.Year            : int  2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
##  $ Intake.Fiscal.Period          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Intake.Date                   : chr  "10/1/2023" "10/1/2023" "10/1/2023" "10/1/2023" ...
##  $ Intake.Section                : chr  "OVER-THE-COUNTER (OTC)" "OVER-THE-COUNTER (OTC)" "FIELD" "FIELD" ...
##  $ Intake.Subsection             : chr  "Quarantine" "Stray" "Stray" "Stray" ...
##  $ Animal.ID                     : chr  "A680497" "A685667" "A689476" "A689477" ...
##  $ Kennel.Identity               : int  545763 545750 545744 545745 545746 545747 545748 545749 545752 545753 ...
##  $ Species                       : chr  "DOG" "CAT" "DOG" "DOG" ...
##  $ Current.Sex                   : chr  "N" "N" "M" "F" ...
##  $ Council.District              : chr  "5" "10" "2" "5" ...
##  $ Zip.Code                      : int  78204 78217 78203 78225 78201 78237 78237 78233 78226 78210 ...
##  $ Animal.Size                   : chr  "MED" "SMALL" "LARGE" "LARGE" ...
##  $ Breed.Group                   : chr  "MASTIFF" "SHORTHAIR" "MASTIFF" "SETTER/RETRIEVE" ...
##  $ Primary.Color                 : chr  "WHITE" "RED" "GRAY" "BLACK" ...
##  $ Animal.Age.Group              : chr  "1 Yr - 5 Yr" "7 wks - 6 Mo" "1 Yr - 5 Yr" "1 Yr - 5 Yr" ...
##  $ Sterilization.Status.at.Intake: chr  "Sterilized at Intake" "Sterilized at Intake" "Intact at Intake" "Intact at Intake" ...
##  $ Owned.Status                  : chr  "Owner Infomation Found" "No Owner Info Found" "No Owner Info Found" "Owner Infomation Found" ...
##  $ Sterilized_DV                 : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ Stray_DV                      : int  0 1 1 0 1 1 1 0 1 0 ...

Variables

This variable is measuring Sterilization Status at Intake (whether the animal was already sterilized when it arrived). By looking at sterilization status as the main outcome, and testing its relationship with species, age, and location, we can learn a great deal about how animal welfare practices vary across the city. The null hypothesis assumes that sterilization is equally likely across all groups, while the alternative hypothesis suggests meaningful differences. If the data support the alternative, this insight could lead to smarter, more targeted strategies to reduce intake numbers and improve animal well-being in San Antonio.

It measures a binary outcome — either the animal was sterilized (1) or not sterilized (0).
This helps identify trends in responsible pet ownership across different areas or animal groups. I have used pastecs::stat.desc.

pastecs::stat.desc 
## function (x, basic = TRUE, desc = TRUE, norm = FALSE, p = 0.95) 
## {
##     stat.desc.vec <- function(x, basic, desc, norm, p) {
##         x <- unlist(x)
##         if (!is.numeric(x)) {
##             Nbrval <- NA
##             Nbrnull <- NA
##             Nbrna <- NA
##             Median <- NA
##             Mean <- NA
##             StdDev <- NA
##             if (basic == TRUE) {
##                 Res1 <- list(nbr.val = NA, nbr.null = NA, nbr.na = NA, 
##                   min = NA, max = NA, range = NA, sum = NA)
##             }
##             else Res1 <- NULL
##             if (desc == TRUE) {
##                 CIMean <- NA
##                 names(CIMean) <- p
##                 Res2 <- list(median = NA, mean = NA, SE.mean = NA, 
##                   CI.mean = NA, var = NA, std.dev = NA, coef.var = NA)
##             }
##             else Res2 <- NULL
##             if (norm == TRUE) {
##                 Res3 <- list(skewness = NA, skew.2SE = NA, kurtosis = NA, 
##                   kurt.2SE = NA, normtest.W = NA, normtest.p = NA)
##             }
##             else Res3 <- NULL
##         }
##         else {
##             Nbrna <- sum(as.numeric(is.na(x)))
##             x <- x[!is.na(x)]
##             Nbrval <- length(x)
##             Nbrnull <- sum(as.numeric(x == 0))
##             if (basic == TRUE) {
##                 Min <- min(x)
##                 Max <- max(x)
##                 Range <- Max - Min
##                 Sum <- sum(x)
##                 Res1 <- list(nbr.val = Nbrval, nbr.null = Nbrnull, 
##                   nbr.na = Nbrna, min = Min, max = Max, range = Range, 
##                   sum = Sum)
##             }
##             else Res1 <- NULL
##             Median <- median(x)
##             names(Median) <- NULL
##             Mean <- mean(x)
##             Var <- var(x)
##             StdDev <- sqrt(Var)
##             SEMean <- StdDev/sqrt(Nbrval)
##             if (desc == TRUE) {
##                 CIMean <- qt((0.5 + p/2), (Nbrval - 1)) * SEMean
##                 names(CIMean) <- p
##                 CoefVar <- StdDev/Mean
##                 Res2 <- list(median = Median, mean = Mean, SE.mean = SEMean, 
##                   CI.mean = CIMean, var = Var, std.dev = StdDev, 
##                   coef.var = CoefVar)
##             }
##             else Res2 <- NULL
##             if (norm == TRUE) {
##                 Skew <- sum((x - mean(x))^3)/(length(x) * sqrt(var(x))^3)
##                 Kurt <- sum((x - mean(x))^4)/(length(x) * var(x)^2) - 
##                   3
##                 SE <- sqrt(6 * Nbrval * (Nbrval - 1)/(Nbrval - 
##                   2)/(Nbrval + 1)/(Nbrval + 3))
##                 Skew.2SE <- Skew/(2 * SE)
##                 SE <- sqrt(24 * Nbrval * ((Nbrval - 1)^2)/(Nbrval - 
##                   3)/(Nbrval - 2)/(Nbrval + 3)/(Nbrval + 5))
##                 Kurt.2SE <- Kurt/(2 * SE)
##                 Ntest <- shapiro.test(x)
##                 Ntest.W <- Ntest$statistic
##                 names(Ntest.W) <- NULL
##                 Ntest.p <- Ntest$p.value
##                 Res3 <- list(skewness = Skew, skew.2SE = Skew.2SE, 
##                   kurtosis = Kurt, kurt.2SE = Kurt.2SE, normtest.W = Ntest.W, 
##                   normtest.p = Ntest.p)
##             }
##             else Res3 <- NULL
##         }
##         Res <- unlist(c(Res1, Res2, Res3))
##         if (length(Res) == 0) 
##             Res <- unlist(list(nbr.val = Nbrval, nbr.null = Nbrnull, 
##                 nbr.na = Nbrna, median = Median, mean = Mean, 
##                 std.dev = StdDev))
##         Res
##     }
##     Basic <- basic
##     Desc <- desc
##     Norm <- norm
##     P <- p
##     if (is.vector(x)) 
##         stat.desc.vec(x, Basic, Desc, Norm, P)
##     else {
##         x <- as.data.frame(x)
##         NamesV <- names(x)
##         StatM <- NULL
##         for (i in 1:ncol(x)) {
##             StatV <- stat.desc.vec(x[i], Basic, Desc, Norm, P)
##             if (is.null(StatM) == TRUE) 
##                 StatM <- data.frame(StatV)
##             else StatM <- cbind(StatM, StatV)
##         }
##         names(StatM) <- NamesV
##         StatM
##     }
## }
## <bytecode: 0x1211aa5d8>
## <environment: namespace:pastecs>
# Sterilization Status at Intake
animals_clean <- animals %>%
  filter(!is.na(Sterilization.Status.at.Intake))  # remove NAs if any

# View first few rows
head(animals_clean$Sterilization.Status.at.Intake)
## [1] "Sterilized at Intake" "Sterilized at Intake" "Intact at Intake"    
## [4] "Intact at Intake"     "Intact at Intake"     "Intact at Intake"
# Convert to numeric indicator: 1 = Sterilized, 0 = Intact
animals_clean <- animals_clean %>%
  mutate(Sterilized_Numeric = ifelse(Sterilization.Status.at.Intake == "Sterilized at Intake", 1, 0))

# Descriptive statistics
stat.desc(animals_clean$Sterilized_Numeric)
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 6.223500e+04 5.753700e+04 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 4.698000e+03 0.000000e+00 7.548807e-02 1.058965e-03 2.075573e-03 6.979074e-02 
##      std.dev     coef.var 
## 2.641794e-01 3.499618e+00

Histogram of Variable

ggplot(animals_clean, aes(x = Sterilized_Numeric)) +
  geom_histogram(binwidth = 0.5, fill = "steelblue", color = "white") +
  labs(title = "Histogram: Sterilization Status at Intake",
       x = "Sterilization Status (0 = Intact, 1 = Sterilized)",
       y = "Count") +
  theme_minimal()

Transform of Variable

Since this variable is binary, transformations (log or sqrt) are not meaningful.
However, for demonstration, we’ll apply a square root transformation anyway to show the process.

animals_clean <- animals_clean %>%
  mutate(Sterilized_sqrt = sqrt(Sterilized_Numeric))

# Describe transformed variable
stat.desc(animals_clean$Sterilized_sqrt)
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 6.223500e+04 5.753700e+04 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 4.698000e+03 0.000000e+00 7.548807e-02 1.058965e-03 2.075573e-03 6.979074e-02 
##      std.dev     coef.var 
## 2.641794e-01 3.499618e+00

Histogram of Transformed Variable

ggplot(animals_clean, aes(x = Sterilized_sqrt)) +
  geom_histogram(binwidth = 0.5, fill = "darkorange", color = "white") +
  labs(title = "Histogram: Square Root of Sterilization Status",
       x = "Square Root of Sterilization Status",
       y = "Count") +
  theme_minimal()

Bar chart of Unsterilized animals by Council District

unsterilized <- animals_clean %>%
  filter(Sterilized_Numeric == 0)

# Create bar chart
ggplot(unsterilized, aes(x = Council.District)) +
  geom_bar(fill = "firebrick", color = "white") +
  labs(title = "Unsterilized Animals by Council District",
       x = "Council District",
       y = "Number of Unsterilized Animals") +
  theme_minimal()

Percentage of Unsterilized animals by Council District

district_summary <- animals_clean %>%
  group_by(Council.District) %>%
  summarise(
    Total = n(),
    Unsterilized = sum(Sterilized_Numeric == 0),
    Percent_Unsterilized = (Unsterilized / Total) * 100
  )

# View the summary
district_summary
## # A tibble: 12 × 4
##    Council.District Total Unsterilized Percent_Unsterilized
##    <chr>            <int>        <int>                <dbl>
##  1 1                 5979         5453                 91.2
##  2 10                3691         3230                 87.5
##  3 2                 7964         7492                 94.1
##  4 3                 9086         8561                 94.2
##  5 4                 8657         8056                 93.1
##  6 5                12239        11554                 94.4
##  7 6                 6013         5468                 90.9
##  8 7                 4705         4258                 90.5
##  9 8                 2050         1834                 89.5
## 10 817 B CNTY           1            1                100  
## 11 9                 1316         1165                 88.5
## 12 OTHER              534          465                 87.1
# Plot percentages
ggplot(district_summary, aes(x = Council.District, y = Percent_Unsterilized)) +
  geom_col(fill = "darkred") +
  labs(title = "Percentage of Unsterilized Animals by Council District",
       x = "Council District",
       y = "Percent Unsterilized (%)") +
  theme_minimal()