library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data <- read_csv("C:/Users/barre/Downloads/animal_shelter.csv")
## Rows: 79672 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (14): outcome_type, sex_upon_outcome, outcome_monthyear, outcome_weekda...
## dbl   (6): age_upon_outcome_years, outcome_number, intake_hour, intake_numbe...
## date  (1): date_of_birth
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Question 1: Plot Data

This histogram shows the age distribution of animals when they leave the shelter (via adoption, return to owner, etc.). Most animals are between 0 and 10 years old, with fewer older animals.

ggplot(data, aes(x = age_upon_outcome_years)) +
  geom_histogram(binwidth = 1, fill = "pink", color = "black") +
  labs(
    title = "Distribution of Age Upon Outcome",
    x = "Age (Years)",
    y = "Number of Animals"
  ) +
  theme_minimal()

Question 2: Statistical Calculation and Visualization

I calculated the average age of animals for each outcome type using the age_upon_outcome_years column. The goal is to explore whether age plays a role in what happens to animals after they enter the shelter

.

mean_age_outcome <- data %>%
  group_by(outcome_type) %>%
  summarise(mean_age = mean(age_upon_outcome_years, na.rm = TRUE)) %>%
  arrange(desc(mean_age))
ggplot(mean_age_outcome, aes(x = reorder(outcome_type, mean_age), y = mean_age)) +
  geom_col(fill = "purple") +
  coord_flip() +
  labs(
    title = "Average Age by Outcome Type",
    x = "Outcome Type",
    y = "Average Age (Years)"
  )

Question 3: Correlation or Regression Analysis.

To answer the question “Do older animals tend to stay longer or shorter in the shelter?”I will test the correlation Between age_upon_intake_years and time_in_shelter_days. Since our resulting p-value is less than 2.2e-16, we reject the null hypothesis that there is no correlation between an animal’s age at intake and the amount of time it spends in the shelter.

cor_test <- cor.test(data$age_upon_intake_years,data$time_in_shelter_days)

cor_test
## 
##  Pearson's product-moment correlation
## 
## data:  data$age_upon_intake_years and data$time_in_shelter_days
## t = 10.702, df = 79670, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03095172 0.04481936
## sample estimates:
##        cor 
## 0.03788737

Question 4:

The histogram shows a right-skewed distribution of time_in_shelter_days, meaning that the majority of animals stay in the shelter for a short period, typically between 0 and 10 days.

library(ggplot2)

ggplot(data, aes(x = time_in_shelter_days)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Time in Shelter (Days)",
       x = "Time in Shelter (Days)",
       y = "Number of Animals") +
  theme_minimal() +
  xlim(0, 30)
## Warning: Removed 12006 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Question 5: Group Comparison Test

To determine whether dogs and cats spend significantly different amounts of time in the shelter I first divided the data set in two rows: Dogs and cats based on the column time_in_shelter_days. The Shapiro-Wilk test showed that the data is not normally distributed,so I used the Wilcoxon rank-sum test. The test result ( p < 0.001) indicates that there is a statistically significant difference in shelter time between cats and dogs.

dog_cat_data <- data %>% filter(animal_type %in% c("Dog","Cat"))

shapiro.test(sample(dog_cat_data$time_in_shelter_days[dog_cat_data$animal_type == "Dog"], 5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(dog_cat_data$time_in_shelter_days[dog_cat_data$animal_type == "Dog"], 5000)
## W = 0.37476, p-value < 2.2e-16
shapiro.test(sample(dog_cat_data$time_in_shelter_days[dog_cat_data$animal_type == "Cat"], 5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(dog_cat_data$time_in_shelter_days[dog_cat_data$animal_type == "Cat"], 5000)
## W = 0.56659, p-value < 2.2e-16
wilcox.test(time_in_shelter_days ~ animal_type, data = dog_cat_data)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  time_in_shelter_days by animal_type
## W = 691971328, p-value = 3.318e-14
## alternative hypothesis: true location shift is not equal to 0