This report analyses the data published by the Austin Animal Shelter. I will try to derive some interesting conclusions or predictions based on the animals that arrive at the shelter.
# Load the raw Austin Animal Center outcomes export into `dane`.
dane <- read_csv(file = "Austin_Animal_Center_Outcomes_20240516.csv")
## Rows: 162475 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): Animal ID, Name, DateTime, MonthYear, Date of Birth, Outcome Type,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
In the first plot I will check the names of the animals in the center. The most suitable chart for this is a bar plot.
# Count how often each non-missing name occurs, most frequent first.
d1 <- dane %>%
  select(Name) %>%
  drop_na() %>%
  group_by(Name) %>%
  summarize(ile = n()) %>%
  arrange(desc(ile))

# Keep at most the ten most frequent names. Unlike d1[1:10, ], head() never
# pads the result with all-NA rows when fewer than ten names are available.
d2 <- head(d1, 10)

# Bar chart of the selected names, tallest bar first (reorder() fixes the bar
# order). The former `order = ile` mapping was removed because it is not an
# aesthetic that geom_col() uses.
ggplot(data = d2, aes(x = reorder(Name, desc(ile)), y = ile)) +
  geom_col() +
  labs(x = "Pet name", y = "Count", title = "Top 10 common names in Animal Shelter")
The second plot shows how many animals of each type come to the animal center in different periods.
# Monthly row counts per animal type, restricted to calendar year 2023.
d3 <- dane %>%
  select(type = `Animal Type`, dt = DateTime) %>%
  drop_na() %>%
  # Parse the timestamp once and derive both calendar fields from it
  # (previously mdy_hms() ran twice per row); TRUE replaces the T shorthand.
  mutate(
    ts = mdy_hms(dt),
    m = month(ts, label = TRUE, abbr = TRUE, locale = "US"),
    y = year(ts)
  ) %>%
  filter(y == 2023) %>%
  group_by(type, m) %>%
  # The helper column `ts` is dropped here; the result stays grouped by `type`.
  summarize(s = n())
## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.
# Interactive stacked area chart of the monthly counts in d3, one band per
# animal type. d3 is filtered to year 2023, so the title names 2023 (it
# previously claimed 2024).
ggplotly(
  ggplot(data = d3, aes(x = m, y = s, group = type, fill = type)) +
    geom_area() +
    labs(x = "Month", y = "Animal Count", title = "2023 timeline of each animals count")
)
# Number of complete records per calendar day, from 2021 onwards.
# Rows with a missing animal type, timestamp or outcome type are excluded.
d4 <- dane %>%
  select(type = `Animal Type`, dt = DateTime, `Outcome Type`) %>%
  drop_na() %>%
  mutate(y = date(mdy_hms(dt))) %>%
  filter(year(y) > 2020) %>%
  group_by(y) %>%
  summarize(s = n())

# Daily series with a smoothed trend line on top.
# The stray `alpha = 0.01` argument to ggplot() was removed: ggplot() does not
# use it, so it never affected the drawing.
# NOTE(review): the input file is the *Outcomes* export, so DateTime is
# presumably the outcome time rather than the intake time -- confirm that the
# "arrival" wording in the labels is what is intended.
ggplot(data = d4, aes(x = y, y = s)) +
  geom_line() +
  stat_smooth() +
  labs(x = "Date", y = "Animal Arrival Count", title = "Timeline of animal arrival during 2021-2024")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# "Return to Owner" records from 2021 onwards, counted per (month, weekday).
d44 <- dane %>%
  select(type = `Animal Type`, dt = DateTime, `Outcome Type`) %>%
  drop_na() %>%
  filter(`Outcome Type` == "Return to Owner") %>%
  mutate(y = date(mdy_hms(dt))) %>%
  filter(year(y) > 2020) %>%
  # TRUE/FALSE are spelled out: the T/F shorthands are ordinary variables that
  # can be reassigned.
  mutate(
    d = wday(y, label = TRUE, abbr = TRUE, locale = "US"),
    m = month(y, label = TRUE, abbr = FALSE, locale = "US")
  ) %>%
  group_by(m, d) %>%
  # The result stays grouped by `m`.
  summarize(s = n())
## `summarise()` has grouped output by 'm'. You can override using the `.groups`
## argument.
#d44
# Interactive chart of d44: for every weekday a thin gray bar topped with a
# black point, drawn in one panel per month.
(
  ggplot(data = d44, mapping = aes(x = d, y = s)) +
    geom_col(colour = "gray", width = 0.1) +
    geom_point(colour = "black", size = 2) +
    labs(
      x = "Weekday",
      y = "Animal Taken",
      title = "Return to owner during whole year"
    ) +
    facet_wrap(~m)
) %>%
  ggplotly()
# "Return to Owner" records from 2021 onwards, counted per (hour, weekday).
# Note that the column `m` holds the hour of the day in this table.
d45 <- dane %>%
  select(type = `Animal Type`, dt = DateTime, `Outcome Type`) %>%
  drop_na() %>%
  filter(`Outcome Type` == "Return to Owner") %>%
  mutate(y = mdy_hms(dt)) %>%
  filter(year(y) > 2020) %>%
  # TRUE is spelled out: the T shorthand is an ordinary variable that can be
  # reassigned.
  mutate(
    d = wday(y, label = TRUE, abbr = TRUE, locale = "US"),
    m = hour(y)
  ) %>%
  group_by(m, d) %>%
  # The result stays grouped by `m`.
  summarize(s = n())
## `summarise()` has grouped output by 'm'. You can override using the `.groups`
## argument.
#d45
# Interactive bar chart of d45: counts by hour of day (column `m`), drawn in
# one panel per weekday.
(
  ggplot(data = d45, mapping = aes(x = m, y = s)) +
    geom_col() +
    labs(
      x = "Time",
      y = "Animal Taken",
      title = "Return to owner during whole week"
    ) +
    facet_wrap(~d)
) %>%
  ggplotly()
# Tally the complete rows by `Sex upon Outcome` (counts go into column `s`).
d5 <- dane %>%
  drop_na() %>%
  count(`Sex upon Outcome`, name = "s")

# Base-graphics pie chart with one slice per category.
pie(
  x = d5$s,
  labels = d5$`Sex upon Outcome`,
  col = c("purple", "violetred1", "green3", "cornsilk", "cyan", "white"),
  border = 400,
  main = "Gender of the animals in the center"
)
The third visualization would try to predict which breed of dog would probably come to the animal center. Later we may investigate why this occurs. A bar plot would work nicely here.
#' Convert textual ages such as "2 years" or "3 weeks" to months
#'
#' The first number found in each string is taken as the magnitude (a leading
#' minus sign is ignored, so "-1 years" is treated as "1 years") and scaled
#' according to the unit word that the string contains:
#' days and weeks are converted using the average month length
#' (365.25 / 12 days), months are kept as they are, years are multiplied by 12.
#' A string without a recognised unit keeps its number unchanged.
#'
#' Compared with the previous element-wise implementation this version
#' converts day-valued ages (they used to be returned as if they were months),
#' scales week-valued ages instead of collapsing all of them to 1, and returns
#' NA instead of failing for entries that contain no number.
#'
#' @param sx Character vector (or factor) of age descriptions.
#' @return An unnamed numeric vector of ages in months, the same length as
#'   `sx`; NA where `sx` is NA or contains no number.
norm_age <- function(sx) {
  sx <- as.character(sx)
  days_per_month <- 365.25 / 12

  # Pull out the first (optionally decimal) number of every entry.
  number_at <- regexpr("[0-9]+(\\.[0-9]+)?", sx)
  has_number <- !is.na(sx) & grepl("[0-9]", sx)
  nr <- rep(NA_real_, length(sx))
  nr[has_number] <- as.numeric(regmatches(sx, number_at))

  # Unit -> months multiplier; 1 covers "month(s)" and unknown units.
  # "year" is applied last so it wins if several unit words appear, matching
  # the order of the original checks.
  to_months <- rep(1, length(sx))
  to_months[grepl("day", sx, fixed = TRUE)] <- 1 / days_per_month
  to_months[grepl("week", sx, fixed = TRUE)] <- 7 / days_per_month
  to_months[grepl("year", sx, fixed = TRUE)] <- 12

  nr * to_months
}
# Age in months (via norm_age) for every complete row whose age text is not
# the literal string "NULL"; only the species, the raw age text and the
# converted age are kept.
d5 <- dane %>%
  drop_na() %>%
  filter(`Age upon Outcome` != "NULL") %>%
  select(`Animal Type`, `Age upon Outcome`) %>%
  mutate(age = norm_age(`Age upon Outcome`))

# Interactive box plot of the age distribution, one box per animal type.
(
  ggplot(data = d5, mapping = aes(x = `Animal Type`, y = age, fill = `Animal Type`)) +
    geom_boxplot() +
    labs(y = "Animal Age in Months", title = "Age of animals")
) %>%
  ggplotly()
The third visualization would try to predict which breed of dog would probably come to the animal center. Later we may investigate why this occurs. A bar plot would work nicely here.
# Complete cases only: rows with a missing value in any column are discarded.
no_na <- drop_na(dane)

# Number of rows for every (outcome type, animal type) pair.
confusion_matrix <- summarize(
  group_by(no_na, `Outcome Type`, `Animal Type`),
  n = n()
)
## `summarise()` has grouped output by 'Outcome Type'. You can override using the
## `.groups` argument.
# Total number of complete rows per animal type.
cnt <- no_na %>%
  group_by(`Animal Type`) %>%
  summarize(n = n())

# Turn the raw counts into the percentage of each animal type's rows that end
# in each outcome. The per-type totals are derived from the data itself rather
# than from hard-coded divisors (83 / 17400 / 22006 / 748), so they stay
# correct when the input file changes and every animal type present in the
# data is normalised, not only Bird, Cat, Dog and Other.
confusion_matrix <- confusion_matrix %>%
  group_by(`Animal Type`) %>%
  mutate(n = 100 * n / sum(n)) %>%
  ungroup()

# Interactive heat map: darker tiles mark outcomes that are more common for
# the given animal type. The legend title was corrected from "Precentage".
ggplotly(
  ggplot(confusion_matrix, aes(x = `Animal Type`, y = `Outcome Type`, fill = n)) +
    geom_tile(color = "gray") +
    scale_fill_gradient(name = "Percentage", low = "gray", high = "black") +
    labs(
      title = "What happens to specific animals regarding species",
      x = "Animal in Consideration",
      y = "Outcome"
    ) +
    theme_bw()
)
The main data source would be Austin’s Shelter open dataset:
Data additional information:
Data specific details:
Columns in data: