library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data <- read_csv("C:/Users/barre/Downloads/animal_shelter.csv")
## Rows: 79672 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): outcome_type, sex_upon_outcome, outcome_monthyear, outcome_weekda...
## dbl (6): age_upon_outcome_years, outcome_number, intake_hour, intake_numbe...
## date (1): date_of_birth
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(data, aes(x = age_upon_outcome_years)) +
geom_histogram(binwidth = 1, fill = "pink", color = "black") +
labs(
title = "Distribution of Age Upon Outcome",
x = "Age (Years)",
y = "Number of Animals"
) +
theme_minimal()
age_upon_outcome_years column. The goal is to explore
whether age plays a role in what happens to animals after they enter the
shelter.
mean_age_outcome <- data %>%
group_by(outcome_type) %>%
summarise(mean_age = mean(age_upon_outcome_years, na.rm = TRUE)) %>%
arrange(desc(mean_age))
ggplot(mean_age_outcome, aes(x = reorder(outcome_type, mean_age), y = mean_age)) +
geom_col(fill = "purple") +
coord_flip() +
labs(
title = "Average Age by Outcome Type",
x = "Outcome Type",
y = "Average Age (Years)"
)
cor_test <- cor.test(data$age_upon_intake_years,data$time_in_shelter_days)
cor_test
##
## Pearson's product-moment correlation
##
## data: data$age_upon_intake_years and data$time_in_shelter_days
## t = 10.702, df = 79670, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03095172 0.04481936
## sample estimates:
## cor
## 0.03788737
library(ggplot2)
ggplot(data, aes(x = time_in_shelter_days)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "Distribution of Time in Shelter (Days)",
x = "Time in Shelter (Days)",
y = "Number of Animals") +
theme_minimal() +
xlim(0, 30)
## Warning: Removed 12006 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
dog_cat_data <- data %>% filter(animal_type %in% c("Dog","Cat"))
shapiro.test(sample(dog_cat_data$time_in_shelter_days[dog_cat_data$animal_type == "Dog"], 5000))
##
## Shapiro-Wilk normality test
##
## data: sample(dog_cat_data$time_in_shelter_days[dog_cat_data$animal_type == "Dog"], 5000)
## W = 0.37476, p-value < 2.2e-16
shapiro.test(sample(dog_cat_data$time_in_shelter_days[dog_cat_data$animal_type == "Cat"], 5000))
##
## Shapiro-Wilk normality test
##
## data: sample(dog_cat_data$time_in_shelter_days[dog_cat_data$animal_type == "Cat"], 5000)
## W = 0.56659, p-value < 2.2e-16
wilcox.test(time_in_shelter_days ~ animal_type, data = dog_cat_data)
##
## Wilcoxon rank sum test with continuity correction
##
## data: time_in_shelter_days by animal_type
## W = 691971328, p-value = 3.318e-14
## alternative hypothesis: true location shift is not equal to 0