library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
library(ggplot2)
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
animals <- read.csv("acs_animals.csv")
# Check structure
str(animals)
## 'data.frame': 62235 obs. of 19 variables:
## $ Intake.Fiscal.Year : int 2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
## $ Intake.Fiscal.Period : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Intake.Date : chr "10/1/2023" "10/1/2023" "10/1/2023" "10/1/2023" ...
## $ Intake.Section : chr "OVER-THE-COUNTER (OTC)" "OVER-THE-COUNTER (OTC)" "FIELD" "FIELD" ...
## $ Intake.Subsection : chr "Quarantine" "Stray" "Stray" "Stray" ...
## $ Animal.ID : chr "A680497" "A685667" "A689476" "A689477" ...
## $ Kennel.Identity : int 545763 545750 545744 545745 545746 545747 545748 545749 545752 545753 ...
## $ Species : chr "DOG" "CAT" "DOG" "DOG" ...
## $ Current.Sex : chr "N" "N" "M" "F" ...
## $ Council.District : chr "5" "10" "2" "5" ...
## $ Zip.Code : int 78204 78217 78203 78225 78201 78237 78237 78233 78226 78210 ...
## $ Animal.Size : chr "MED" "SMALL" "LARGE" "LARGE" ...
## $ Breed.Group : chr "MASTIFF" "SHORTHAIR" "MASTIFF" "SETTER/RETRIEVE" ...
## $ Primary.Color : chr "WHITE" "RED" "GRAY" "BLACK" ...
## $ Animal.Age.Group : chr "1 Yr - 5 Yr" "7 wks - 6 Mo" "1 Yr - 5 Yr" "1 Yr - 5 Yr" ...
## $ Sterilization.Status.at.Intake: chr "Sterilized at Intake" "Sterilized at Intake" "Intact at Intake" "Intact at Intake" ...
## $ Owned.Status : chr "Owner Infomation Found" "No Owner Info Found" "No Owner Info Found" "Owner Infomation Found" ...
## $ Sterilized_DV : int 1 1 0 0 0 0 0 0 0 0 ...
## $ Stray_DV : int 0 1 1 0 1 1 1 0 1 0 ...
This variable is measuring Sterilization Status at Intake (whether the animal was already sterilized when it arrived). By looking at sterilization status as the main outcome, and testing its relationship with species, age, and location, we can learn a great deal about how animal welfare practices vary across the city. The null hypothesis assumes that sterilization is equally likely across all groups, while the alternative hypothesis suggests meaningful differences. If the data support the alternative, this insight could lead to smarter, more targeted strategies to reduce intake numbers and improve animal well-being in San Antonio.
It measures a binary outcome — either the animal was
sterilized (1) or not sterilized
(0).
This helps identify trends in responsible pet ownership across different
areas or animal groups. I have used pastecs::stat.desc.
pastecs::stat.desc
## function (x, basic = TRUE, desc = TRUE, norm = FALSE, p = 0.95)
## {
## stat.desc.vec <- function(x, basic, desc, norm, p) {
## x <- unlist(x)
## if (!is.numeric(x)) {
## Nbrval <- NA
## Nbrnull <- NA
## Nbrna <- NA
## Median <- NA
## Mean <- NA
## StdDev <- NA
## if (basic == TRUE) {
## Res1 <- list(nbr.val = NA, nbr.null = NA, nbr.na = NA,
## min = NA, max = NA, range = NA, sum = NA)
## }
## else Res1 <- NULL
## if (desc == TRUE) {
## CIMean <- NA
## names(CIMean) <- p
## Res2 <- list(median = NA, mean = NA, SE.mean = NA,
## CI.mean = NA, var = NA, std.dev = NA, coef.var = NA)
## }
## else Res2 <- NULL
## if (norm == TRUE) {
## Res3 <- list(skewness = NA, skew.2SE = NA, kurtosis = NA,
## kurt.2SE = NA, normtest.W = NA, normtest.p = NA)
## }
## else Res3 <- NULL
## }
## else {
## Nbrna <- sum(as.numeric(is.na(x)))
## x <- x[!is.na(x)]
## Nbrval <- length(x)
## Nbrnull <- sum(as.numeric(x == 0))
## if (basic == TRUE) {
## Min <- min(x)
## Max <- max(x)
## Range <- Max - Min
## Sum <- sum(x)
## Res1 <- list(nbr.val = Nbrval, nbr.null = Nbrnull,
## nbr.na = Nbrna, min = Min, max = Max, range = Range,
## sum = Sum)
## }
## else Res1 <- NULL
## Median <- median(x)
## names(Median) <- NULL
## Mean <- mean(x)
## Var <- var(x)
## StdDev <- sqrt(Var)
## SEMean <- StdDev/sqrt(Nbrval)
## if (desc == TRUE) {
## CIMean <- qt((0.5 + p/2), (Nbrval - 1)) * SEMean
## names(CIMean) <- p
## CoefVar <- StdDev/Mean
## Res2 <- list(median = Median, mean = Mean, SE.mean = SEMean,
## CI.mean = CIMean, var = Var, std.dev = StdDev,
## coef.var = CoefVar)
## }
## else Res2 <- NULL
## if (norm == TRUE) {
## Skew <- sum((x - mean(x))^3)/(length(x) * sqrt(var(x))^3)
## Kurt <- sum((x - mean(x))^4)/(length(x) * var(x)^2) -
## 3
## SE <- sqrt(6 * Nbrval * (Nbrval - 1)/(Nbrval -
## 2)/(Nbrval + 1)/(Nbrval + 3))
## Skew.2SE <- Skew/(2 * SE)
## SE <- sqrt(24 * Nbrval * ((Nbrval - 1)^2)/(Nbrval -
## 3)/(Nbrval - 2)/(Nbrval + 3)/(Nbrval + 5))
## Kurt.2SE <- Kurt/(2 * SE)
## Ntest <- shapiro.test(x)
## Ntest.W <- Ntest$statistic
## names(Ntest.W) <- NULL
## Ntest.p <- Ntest$p.value
## Res3 <- list(skewness = Skew, skew.2SE = Skew.2SE,
## kurtosis = Kurt, kurt.2SE = Kurt.2SE, normtest.W = Ntest.W,
## normtest.p = Ntest.p)
## }
## else Res3 <- NULL
## }
## Res <- unlist(c(Res1, Res2, Res3))
## if (length(Res) == 0)
## Res <- unlist(list(nbr.val = Nbrval, nbr.null = Nbrnull,
## nbr.na = Nbrna, median = Median, mean = Mean,
## std.dev = StdDev))
## Res
## }
## Basic <- basic
## Desc <- desc
## Norm <- norm
## P <- p
## if (is.vector(x))
## stat.desc.vec(x, Basic, Desc, Norm, P)
## else {
## x <- as.data.frame(x)
## NamesV <- names(x)
## StatM <- NULL
## for (i in 1:ncol(x)) {
## StatV <- stat.desc.vec(x[i], Basic, Desc, Norm, P)
## if (is.null(StatM) == TRUE)
## StatM <- data.frame(StatV)
## else StatM <- cbind(StatM, StatV)
## }
## names(StatM) <- NamesV
## StatM
## }
## }
## <bytecode: 0x1211aa5d8>
## <environment: namespace:pastecs>
# Sterilization Status at Intake
animals_clean <- animals %>%
filter(!is.na(Sterilization.Status.at.Intake)) # remove NAs if any
# View first few rows
head(animals_clean$Sterilization.Status.at.Intake)
## [1] "Sterilized at Intake" "Sterilized at Intake" "Intact at Intake"
## [4] "Intact at Intake" "Intact at Intake" "Intact at Intake"
# Convert to numeric indicator: 1 = Sterilized, 0 = Intact
animals_clean <- animals_clean %>%
mutate(Sterilized_Numeric = ifelse(Sterilization.Status.at.Intake == "Sterilized at Intake", 1, 0))
# Descriptive statistics
stat.desc(animals_clean$Sterilized_Numeric)
## nbr.val nbr.null nbr.na min max range
## 6.223500e+04 5.753700e+04 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00
## sum median mean SE.mean CI.mean.0.95 var
## 4.698000e+03 0.000000e+00 7.548807e-02 1.058965e-03 2.075573e-03 6.979074e-02
## std.dev coef.var
## 2.641794e-01 3.499618e+00
ggplot(animals_clean, aes(x = Sterilized_Numeric)) +
geom_histogram(binwidth = 0.5, fill = "steelblue", color = "white") +
labs(title = "Histogram: Sterilization Status at Intake",
x = "Sterilization Status (0 = Intact, 1 = Sterilized)",
y = "Count") +
theme_minimal()
Since this variable is binary, transformations (log or sqrt) are not
meaningful.
However, for demonstration, we’ll apply a square root
transformation anyway to show the process.
animals_clean <- animals_clean %>%
mutate(Sterilized_sqrt = sqrt(Sterilized_Numeric))
# Describe transformed variable
stat.desc(animals_clean$Sterilized_sqrt)
## nbr.val nbr.null nbr.na min max range
## 6.223500e+04 5.753700e+04 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00
## sum median mean SE.mean CI.mean.0.95 var
## 4.698000e+03 0.000000e+00 7.548807e-02 1.058965e-03 2.075573e-03 6.979074e-02
## std.dev coef.var
## 2.641794e-01 3.499618e+00
ggplot(animals_clean, aes(x = Sterilized_sqrt)) +
geom_histogram(binwidth = 0.5, fill = "darkorange", color = "white") +
labs(title = "Histogram: Square Root of Sterilization Status",
x = "Square Root of Sterilization Status",
y = "Count") +
theme_minimal()
unsterilized <- animals_clean %>%
filter(Sterilized_Numeric == 0)
# Create bar chart
ggplot(unsterilized, aes(x = Council.District)) +
geom_bar(fill = "firebrick", color = "white") +
labs(title = "Unsterilized Animals by Council District",
x = "Council District",
y = "Number of Unsterilized Animals") +
theme_minimal()
district_summary <- animals_clean %>%
group_by(Council.District) %>%
summarise(
Total = n(),
Unsterilized = sum(Sterilized_Numeric == 0),
Percent_Unsterilized = (Unsterilized / Total) * 100
)
# View the summary
district_summary
## # A tibble: 12 × 4
## Council.District Total Unsterilized Percent_Unsterilized
## <chr> <int> <int> <dbl>
## 1 1 5979 5453 91.2
## 2 10 3691 3230 87.5
## 3 2 7964 7492 94.1
## 4 3 9086 8561 94.2
## 5 4 8657 8056 93.1
## 6 5 12239 11554 94.4
## 7 6 6013 5468 90.9
## 8 7 4705 4258 90.5
## 9 8 2050 1834 89.5
## 10 817 B CNTY 1 1 100
## 11 9 1316 1165 88.5
## 12 OTHER 534 465 87.1
# Plot percentages
ggplot(district_summary, aes(x = Council.District, y = Percent_Unsterilized)) +
geom_col(fill = "darkred") +
labs(title = "Percentage of Unsterilized Animals by Council District",
x = "Council District",
y = "Percent Unsterilized (%)") +
theme_minimal()