Final Report

Analysis of the Animal Center

This report analyzes the data provided by the Austin Animal Shelter. I will try to derive some interesting conclusions or predictions based on the animals that arrive at the shelter.

# Load the shelter outcomes export; every later chunk reads from `dane`.
dane <- read_csv(file = "Austin_Animal_Center_Outcomes_20240516.csv")

## Rows: 162475 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): Animal ID, Name, DateTime, MonthYear, Date of Birth, Outcome Type,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Visualization 1

In the first plot I will check the names of the animals in the center. The most suitable chart for this is a bar plot.

# Frequency of every non-missing name, most frequent first (`ile` = count).
d1 <- dane %>%
  select(Name) %>%
  drop_na() %>%
  count(Name, name = "ile", sort = TRUE)

# At most the ten most frequent names. slice_head() returns fewer rows rather
# than NA-filled rows when `d1` has fewer than ten entries.
d2 <- d1 %>%
  slice_head(n = 10)

# Bar chart with bars ordered from the most to the least frequent name.
ggplot(data = d2, aes(x = reorder(Name, desc(ile)), y = ile)) +
  geom_col() +
  labs(x = "Pet name", y = "Count", title = "Top 10 common names in Animal Shelter")

Visualization 2

The second plot shows how many animals of each type come to the animal center in different periods.

# Monthly number of records per animal type, restricted to the year 2023.
d3 <- dane %>%
  select(type = `Animal Type`, dt = DateTime) %>%
  drop_na() %>%
  # Parse the timestamp once and derive both calendar fields from it.
  mutate(
    dt = mdy_hms(dt),
    m = month(dt, label = TRUE, abbr = TRUE, locale = "US"),
    y = year(dt)
  ) %>%
  filter(y == 2023) %>%
  group_by(type, m) %>%
  summarize(s = n())

## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.

# Interactive stacked area chart of the monthly counts in `d3`, one band per
# animal type. The title names 2023 because `d3` is filtered to `y == 2023`.
ggplotly(
  ggplot(data = d3, aes(x = m, y = s, group = type, fill = type)) +
    geom_area() +
    labs(x = "Month", y = "Animal Count", title = "2023 timeline of each animals count")
)

Visualization 3

# Number of records per calendar day from 2021 onwards (all animal types and
# outcome types together).
d4 <- dane %>%
  # `type` and `Outcome Type` are selected only so that drop_na() also removes
  # rows in which either of them is missing.
  select(type = `Animal Type`, dt = DateTime, `Outcome Type`) %>%
  drop_na() %>%
  mutate(y = date(mdy_hms(dt))) %>%
  filter(year(y) > 2020) %>%
  count(y, name = "s")

# Raw daily series plus a smoothed trend line. Transparency, if wanted, has to
# be set on a layer (e.g. geom_line(alpha = ...)); ggplot() itself does not
# use an `alpha` argument.
ggplot(data = d4, aes(x = y, y = s)) +
  geom_line() +
  stat_smooth() +
  labs(x = "Date", y = "Animal Arrival Count", title = "Timeline of animal arrival during 2021-2024")

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Visualization 4

# "Return to Owner" records from 2021 onwards, tallied per month name (`m`)
# and weekday (`d`).
d44 <- dane %>%
  select(type = `Animal Type`, dt = DateTime, `Outcome Type`) %>%
  drop_na() %>%
  filter(`Outcome Type` == "Return to Owner") %>%
  mutate(
    y = date(mdy_hms(dt)),
    d = wday(y, label = TRUE, abbr = TRUE, locale = "US"),
    m = month(y, label = TRUE, abbr = FALSE, locale = "US")
  ) %>%
  filter(year(y) > 2020) %>%
  group_by(m, d) %>%
  summarize(s = n())

## `summarise()` has grouped output by 'm'. You can override using the `.groups`
## argument.

#d44
# Interactive chart of `d44`: weekday on the x axis, one panel per month.
# Each count is drawn as a thin gray bar topped with a black point.
(
  ggplot(d44, aes(x = d, y = s)) +
    geom_col(colour = "gray", width = 0.1) +
    geom_point(colour = "black", size = 2) +
    facet_wrap(. ~ m) +
    labs(x = "Weekday", y = "Animal Taken", title = "Return to owner during whole year")
) %>%
  ggplotly()

Visualization 5

# "Return to Owner" records from 2021 onwards, tallied per hour of day (`m`)
# and weekday (`d`).
d45 <- dane %>%
  select(type = `Animal Type`, dt = DateTime, `Outcome Type`) %>%
  drop_na() %>%
  filter(`Outcome Type` == "Return to Owner") %>%
  mutate(
    y = mdy_hms(dt),
    d = wday(y, label = TRUE, abbr = TRUE, locale = "US"),
    m = hour(y)
  ) %>%
  filter(year(y) > 2020) %>%
  group_by(m, d) %>%
  summarize(s = n())

## `summarise()` has grouped output by 'm'. You can override using the `.groups`
## argument.

#d45
# Interactive bar chart of `d45`: hour of day on the x axis, one panel per
# weekday.
(
  ggplot(d45, aes(x = m, y = s)) +
    geom_col() +
    facet_wrap(. ~ d) +
    labs(x = "Time", y = "Animal Taken", title = "Return to owner during whole week")
) %>%
  ggplotly()

Visualization 6

# Number of records per value of `Sex upon Outcome`. Only rows missing that
# column are dropped, so gaps in unrelated columns (e.g. Name) do not remove
# animals from the chart.
d5 <- dane %>%
  drop_na(`Sex upon Outcome`) %>%
  group_by(`Sex upon Outcome`) %>%
  summarize(s = n())

# Base-graphics pie chart, one slice per category in `d5`.
pie(
  d5$s,
  d5$`Sex upon Outcome`,
  col = c("purple", "violetred1", "green3", "cornsilk", "cyan", "white"),
  border = 400,
  main = "Gender of the animals in the center"
)

Visualization 7

The seventh visualization compares the age of the animals at outcome, converted to months, across the animal types. A boxplot works well here because it shows the median and the spread for each type.

#' Convert textual ages such as "2 years" or "3 weeks" to months
#'
#' The first number found in each string is taken as the magnitude (a leading
#' minus sign is ignored, so "-1 years" counts as 1 year) and is scaled
#' according to the unit word contained in the string:
#' days and weeks are converted using the average month length
#' (365.25 / 12 days), months are kept as they are, years are multiplied by 12.
#' A string with a number but none of these unit words is treated as months.
#'
#' @param sx Character vector of age descriptions.
#' @return Unnamed numeric vector of the same length as `sx` holding the age
#'   in months; `NA` where the entry is missing or contains no number
#'   (for example "NULL").
norm_age <- function(sx) {
  sx <- as.character(sx)

  # Locate the first unsigned number in every string; entries without one
  # (or NA entries) keep NA as their magnitude.
  hit <- regexpr("[0-9]+(\\.[0-9]+)?", sx)
  found <- !is.na(hit) & hit > -1L
  nr <- rep(NA_real_, length(sx))
  nr[found] <- as.numeric(regmatches(sx, hit))

  # Months represented by one unit of each kind. Matching on the singular
  # stem also covers the plural ("week" matches "weeks").
  days_per_month <- 365.25 / 12
  months_per_unit <- rep(1, length(sx))
  months_per_unit[grepl("day", sx, fixed = TRUE)] <- 1 / days_per_month
  months_per_unit[grepl("week", sx, fixed = TRUE)] <- 7 / days_per_month
  months_per_unit[grepl("year", sx, fixed = TRUE)] <- 12

  nr * months_per_unit
}

# Age in months for every record that has an animal type and a usable age.
# Only the two columns used here are checked for missing values, and only age
# entries that contain a digit are kept, which also excludes the literal
# "NULL" placeholder.
d5 <- dane %>%
  drop_na(`Animal Type`, `Age upon Outcome`) %>%
  filter(str_detect(`Age upon Outcome`, "[0-9]")) %>%
  mutate(age = norm_age(`Age upon Outcome`)) %>%
  select(`Animal Type`, `Age upon Outcome`, age)


# Interactive boxplot of `age` from `d5`, one box (and fill colour) per
# animal type.
(
  ggplot(d5, aes(x = `Animal Type`, y = age, fill = `Animal Type`)) +
    geom_boxplot() +
    labs(y = "Animal Age in Months", title = "Age of animals")
) %>%
  ggplotly()

Visualization 8

The eighth visualization shows, for each animal type, what share of its animals ends with each outcome type. A heatmap works well here. Later we may investigate why these differences occur.

# Records that have both fields used in this table. Missing values in
# unrelated columns (e.g. Name) do not remove rows.
no_na <- dane %>% drop_na(`Outcome Type`, `Animal Type`)

# Number of records for every (outcome type, animal type) combination.
confusion_matrix <- no_na %>% group_by(`Outcome Type`, `Animal Type`) %>% summarize(n = n())

## `summarise()` has grouped output by 'Outcome Type'. You can override using the
## `.groups` argument.

# Total number of records per animal type; used as the denominator below.
cnt <- no_na %>% group_by(`Animal Type`) %>% summarize(n = n())

# Turn the counts into shares within each animal type (each animal type sums
# to 1). The totals come from `cnt`, so every animal type present in the data
# is normalised and the figures follow the data when the export changes.
confusion_matrix <- confusion_matrix %>%
  ungroup() %>%
  left_join(rename(cnt, total = n), by = "Animal Type") %>%
  mutate(n = n / total) %>%
  select(-total)


# Interactive heatmap of `confusion_matrix`: animal type on the x axis,
# outcome type on the y axis, tile shade taken from `n` (gray = low,
# black = high).
(
  ggplot(confusion_matrix, aes(x = `Animal Type`, y = `Outcome Type`, fill = n)) +
    geom_tile(color = "gray") +
    scale_fill_gradient(name = "Precentage", low = "gray", high = "black") +
    labs(
      title = "What happens to specific animals regarding species",
      x = "Animal in Consideration",
      y = "Outcome"
    ) +
    theme_bw()
) %>%
  ggplotly()

Data Sources

The main data source would be Austin’s Shelter open dataset:

https://data.austintexas.gov/Health-and-Community-Services/Austin-Animal-Center-Outcomes/9t4d-g238/about_data
The data were collected by the employees of the shelter.

Data additional information:

Data Provided by City of Austin, Texas - data.austintexas.gov
Dataset Owner Duron Davis
Category Health and Community Services
Tags: outcomes,animal,cat,dog,pet,no kill,transfer,adoption,missing
License Public Domain
Source Link http://www.austintexas.gov/department/animal-services

Description of the Data

Data specific details:

Timespan in years 2013-2024
Rows 162K
- Each row is one outcome per animal per encounter
Columns 12
Data contains only information about the Animal Center in Austin, Texas

Columns in data:

Animal ID - Plain Text
Name - Plain Text
DateTime - Date & Time
MonthYear - Date & Time
Date of Birth - Date & Time
Outcome Type - Plain Text
Outcome Subtype - Plain Text
Animal Type - Plain Text
Sex upon Outcome - Plain Text
Age upon Outcome - Plain Text
Breed - Plain Text
Color - Plain Text