Introduction

The Purpose of this project was to create data visualizations using the Australian Animal Outcomes data set. The visualizations that I have created mainly answer questions related to how the outcomes vary from region to region in Australia, and also how they vary based on the animal category. One main area that I focused on was the Euthanization rate of Cats and Dogs, so there are more than one visualizations related to that. I also used web scraping to get the additional data related to computing the euthanization rate, like the population of the Australian States, and also their Latitudes and Longitudes, so that I can plot them on a map.

Set Up

## Clear environment
rm(list = ls())

#Loading the Libraries 
pacman::p_load(tidyverse,readr, data.table, kableExtra, leaflet, ggpubr, gganimate, magick,ggthemes, plotly, modelsummary,
               rvest,sf,maps,scales,dplyr)

Loading Data

This data set includes the data related to the animal outcomes in Australia from 1999 to 2018. There are 8 possible outcomes, namely, Reclaimed, Re-homed, Other, Euthanized, Released, In Stock, Transferred & Currently In Care.

#Loading the Data from Github, as data tables, by using the fread command
animal_outcomes <- fread('https://raw.githubusercontent.com/Rauhannazir/DV2/main/animal_outcomes.csv')

Define Custome Theme

theme_syshtu <- function(){ 
  font <- "Times"   #assign font family up front
  
  theme_minimal() %+replace%    
    
    theme(
      
      #grid elements
      panel.grid.minor = element_blank(),
      panel.background = element_blank(),
      
      #text elements
      plot.title = element_text(             
        family = font,            
        size = 12,                
        hjust = 0.5,                
        vjust = 2,
        color = "Black",
        face= "bold"),               
      
      plot.subtitle = element_text(
        family = font,
        size = 10,
        hjust = 0.5,
        color = "#2ca25f"),
      
      plot.caption = element_text(  
        family = font,
        size = 6,        
        hjust = 1),      
      
      axis.title = element_text(    
        family = font,   
        size = 10),      
      
      axis.text = element_text(     
        family = font,   
        size = 9),       
      
      axis.text.x = element_text(   
        margin=margin(5, b = 10))
    )
}

Data Exploration

I explored the data to see if there were any value so that they can be dealt with. However, in our data set there were were only a few missing values and did not need any modification. After that I did some data exploratory analysis like checking the structure and distribution of the variables.

#Checking for missing values 
to_filter <- sapply(animal_outcomes  , function(x) sum(is.na(x)))

#Data Exploration 
glimpse(animal_outcomes)
## Rows: 664
## Columns: 12
## $ year        <int> 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999~
## $ animal_type <chr> "Dogs", "Dogs", "Dogs", "Dogs", "Cats", "Cats", "Cats", "C~
## $ outcome     <chr> "Reclaimed", "Rehomed", "Other", "Euthanized", "Reclaimed"~
## $ ACT         <int> 610, 1245, 12, 360, 111, 1442, 0, 1007, 0, 1, 0, 0, 2, 90,~
## $ NSW         <int> 3140, 7525, 745, 9221, 201, 3913, 447, 8205, 0, 12, 0, 8, ~
## $ NT          <int> 205, 526, 955, 9, 22, 269, 0, 847, 1, 3, 0, 0, 0, 120, 0, ~
## $ QLD         <int> 1392, 5489, 860, 9214, 206, 3901, 386, 10554, 0, 3, 11, 1,~
## $ SA          <int> 2329, 1105, 380, 1701, 157, 1055, 46, 3415, 2, 10, 1, 0, 1~
## $ TAS         <int> 516, 480, 168, 599, 31, 752, 124, 1056, 1, 0, 2, 0, 1, 25,~
## $ VIC         <int> 7130, 4908, 1001, 5217, 884, 3768, 1501, 6113, 87, 19, 0, ~
## $ WA          <int> 1, 137, 6, 18, 0, 62, 5, 5, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, ~
## $ Total       <int> 15323, 21415, 4127, 26339, 1612, 15162, 2509, 31202, 91, 4~
datasummary_skim(animal_outcomes)
Unique (#) Missing (%) Mean SD Min Median Max
year 20 0 2009.0 5.6 1999.0 2009.0 2018.0
ACT 287 0 201.9 354.5 0.0 36.0 1628.0
NSW 363 0 970.6 2182.4 0.0 78.5 13267.0
NT 198 2 210.8 806.2 0.0 1.0 8150.0
QLD 422 0 1289.9 2562.5 0.0 198.5 15690.0
SA 308 0 318.3 662.2 0.0 34.5 4252.0
TAS 241 0 141.3 288.0 0.0 21.0 1974.0
VIC 371 0 992.3 2009.9 0.0 108.5 10567.0
WA 171 0 88.5 319.1 0.0 2.0 6035.0
Total 561 0 4199.4 7693.9 0.0 1015.0 42731.0
unique(animal_outcomes$animal_type)
## [1] "Dogs"          "Cats"          "Horses"        "Livestock"    
## [5] "Wildlife"      "Other Animals"
unique(animal_outcomes$outcome)
## [1] "Reclaimed"         "Rehomed"           "Other"            
## [4] "Euthanized"        "Released"          "In Stock"         
## [7] "Transferred"       "Currently In Care"
colnames(animal_outcomes)
##  [1] "year"        "animal_type" "outcome"     "ACT"         "NSW"        
##  [6] "NT"          "QLD"         "SA"          "TAS"         "VIC"        
## [11] "WA"          "Total"

Time Series of Animal Outcomes

The following visualization is showing how the animal outcomes have evolved over the years in Australia. This will give us an understanding of the trend and make the picture more clearer.

#Making an extra column of sum of aniamls in each outcome year wise 
outcome_yearly <- animal_outcomes[,.(ACT = sum(ACT),
      NSW = sum(NT),
      QLD = sum(QLD),
      SA = sum(SA),
      TAS = sum(TAS),
      VIC=sum(VIC),
      WA= sum(WA)),
   by=.(outcome= as.factor(outcome), year)]

outcome_yearly[, `:=`(SUM = rowSums(.SD, na.rm=T)), .SDcols=c("ACT","NSW","QLD",
                                                              "SA","TAS","VIC", "WA")]
outcome_yearly$outcome <- as.factor(outcome_yearly$outcome)

#Plotting

viz <- outcome_yearly %>%
  filter(year >= 2000 & year <= 2018) %>%
  ggplot() +
  aes(x = year, y = SUM, colour = outcome, group = outcome) +
  geom_line(size = 0.8) +
  scale_color_viridis_d(option = "plasma", direction = 1) +
  ggtitle("Time Series of Animal Outcome")+
  ylab("Number of Animals")+
  xlab("Year") + 
  theme_syshtu()
viz+transition_reveal(year)

Animal Category wise outcome

I made this visualization to see if there was any difference in the outcomes of different animals.

#Plotting the animal outcomes according to different animal categories 

outcome_yearly1 <- animal_outcomes[,.(ACT = sum(ACT),
                        NSW = sum(NT),
                        QLD = sum(QLD),
                        SA = sum(SA),
                        TAS = sum(TAS),
                        VIC=sum(VIC),
                        WA= sum(WA)),
                     by=.(outcome= as.factor(outcome), year, animal_type)]
outcome_yearly1[, `:=`(SUM = rowSums(.SD, na.rm=T)), .SDcols=c("ACT","NSW","QLD",
                                                              "SA","TAS","VIC", "WA")]

viz2 <- ggplot(outcome_yearly1) +
  aes(x = year, y = SUM, colour = outcome, group = outcome) +
  geom_line(size = 0.5) +
  scale_color_hue(direction = 1) +
  facet_wrap(vars(animal_type), scales = "free") +
  ggtitle("Outcomes of All the Animal Categories") +
  ylab("Number of Animals") + 
  theme_syshtu()

viz2

Box Plots for Each Animal according to the outcome they had

To further explore the outcomes according to the different animal categories I made these box plots. These will help us in exploring and understanding the distribution of the animal outcomes.

outcome_yearly1$outcome<- as.factor(outcome_yearly1$outcome)
viz3 <- ggplot(outcome_yearly1) +
  aes(x = outcome, y = SUM, fill = outcome) +
  geom_boxplot(shape = "circle") +
  scale_fill_hue(direction = 1) +
  facet_wrap(vars(animal_type), scales = "free_y") +
  theme_syshtu() +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) + 
  theme(legend.position = "right") +
  ggtitle("Box Plot for All animal Categories") +
  ylab("Number of Animals") 
  
viz3+ transition_states(outcome)+
  shadow_mark(alpha=0.5)+
  enter_grow()+
  exit_fly()+
  ease_aes("back-out")

State wise animal outcomes

Rather than just looking at the data only country wise, I wanted to explore if there was a difference in how the animals were being treated in different states. For that I scraped the data like the population, area and postal codes, which will also be useful for the next visualizations. I only filtered to Cats and Dogs, since these are the most common pets and also to the year 2018, the latest available data.

states_territories <- read_html("https://en.wikipedia.org/wiki/States_and_territories_of_Australia") 
tables <- states_territories %>% html_table(fill = TRUE)


#Getting the desired tables from all the lists and converting them to data frames
states_table <- tables[[3]] 
states_table <- janitor::clean_names(states_table)

territories_table <- tables[[4]] 
territories_table <- janitor::clean_names(territories_table)



#Selecting only the required columns 
states <- states_table %>%
  select(region = state,
         code = postal,
         population = contains("population"),
         area = contains("area"))

territories <- territories_table %>%
  filter(territory != "Jervis Bay Territory") %>%
  select(region = territory,
         code = postal,
         population = contains("population"),
         area = contains("area"))


state_territory_data <- bind_rows(states, territories) %>%
  mutate(code = str_to_upper(code),
         population = readr::parse_number(population),
         area = readr::parse_number(area))

#Joining the animal outcomes data with state and territories data 
animal_outcomes_tidy <- animal_outcomes %>%
  pivot_longer(ACT:WA, names_to = "code", values_to = "n") %>%
  inner_join(state_territory_data, by = "code")


#Comparing Cat and Dog outcomes by region in 2018 
#I chose 2018 because it was the latest available year
viz4 <- animal_outcomes_tidy %>%
  filter(year == 2018, animal_type %in% c("Dogs", "Cats")) %>%
  mutate(per_capita_million = n / population * 1000000,
         region = fct_reorder(region, -n, sum),
         outcome = fct_reorder(outcome, n, sum)) %>%
  ggplot(aes(n, outcome, fill = animal_type)) +
  geom_col(position = "dodge") +
  facet_wrap(~ region, scales = "free_x") +
  labs(title = "Comparing cat and dog outcomes by region in 2018",
       x = "Number of animals",
       fill = "") + 
  theme_syshtu()

viz4

Euthanization Rate of Cats and Dogs

I think one of the most important outcome to explore is the euthanization percentage. So to explore that I made a new variable to reflect that. This visualization will show what percentage of cats and dogs were euthanized in each state, and the size of the points will also reflect the absolute number of the animals.

by_year_region <- animal_outcomes_tidy %>%
  filter(animal_type %in% c("Dogs", "Cats")) %>%
  filter(!is.na(n)) %>%
  group_by(year, animal_type, region, code) %>%
  summarize(n_animals = sum(n),
            pct_euthanized = sum(n[outcome == "Euthanized"]) / sum(n)) %>%
  ungroup()


 viz5 <-  by_year_region %>%
  mutate(region = fct_reorder(region, -n_animals, sum)) %>%
  ggplot(aes(year, pct_euthanized, color = animal_type)) +
  geom_line() +
  geom_point(aes(size = n_animals)) +
  facet_wrap(~ region) +
  scale_y_continuous(labels = percent) +
  labs(x = "Year",
       y = "% euthanized",
       size = "# of animals") +
  ggtitle("%age of Cats & Dogs Euthanized") 
 viz5+transition_reveal(year)

Euthanization Rate Map

I also wanted to show this information on a map. So, for that I first scraped the Longitudes and Latitudes of all the states, so that the points can be plotted. While the color of the gradient will reflect how high the euthanization percentage is.

 #Scraping the longitude and latitude for each state in Australia
 australia_states <- read_html("https://www.distancelatlong.com/country/australia") 
 tables_states <- australia_states %>% html_table(fill = TRUE)
 final_long_lat <- tables_states[[3]]
 
 final_long_lat <- as.data.table(final_long_lat)
 by_year_region <- as.data.table(by_year_region)

 
#Removing brackets from the values so that it can be joined with the other data table   
final_long_lat$States[final_long_lat$States == "Australian Capital Territory (2)"] <- "Australian Capital Territory"
final_long_lat$States[final_long_lat$States == "New South Wales (55)"] <- "New South Wales"
final_long_lat$States[final_long_lat$States == "Northern Territory (11)"] <- "Northern Territory"
final_long_lat$States[final_long_lat$States == "Queensland (53)"] <- "Queensland"
final_long_lat$States[final_long_lat$States == "South Australia (25)"] <- "South Australia"
final_long_lat$States[final_long_lat$States == "Tasmania (10)"] <- "Tasmania"
final_long_lat$States[final_long_lat$States == "Victoria (26)"] <- "Victoria"
final_long_lat$States[final_long_lat$States == "Western Australia (42)"] <- "Western Australia"


final_long_lat$region <- final_long_lat$States 
final_long_lat$States <- NULL 

#Joining the two data tables to get the longitudes and latitudes 
by_year_region_w_cord <- final_long_lat[by_year_region, on =.(region)]

fv6 <- by_year_region_w_cord[year==2018]

viz6 <- ggmap::qmplot(x = Longitude,y=Latitude, data = fv6, maptype = "terrain",
       geom = "point", color = pct_euthanized, size = 14, zoom = 5) +
  scale_color_gradient(low="blue",high="red")

viz6 + ggtitle("%age of Cats & Dogs Euthanized in 2018") + theme_syshtu()

Conclusion

We used some powerful packages to create some plots, including the animated ones, which helped us uncover some interesting findings. For instance we could see that the euthanization rate in almost all of the states is on a downward trend. We were able to drill down into the data through visulizations according to different categories, for instance on a state level and the animal categories.