Introduction
For the purpose of this project I chose the Animal Rescue data from TidyTuesday. The data shows the rescues made by the London Fire Brigade from 2009 to 2021 along with the cost of each rescue operation. I first cleaned the data and then moved on to creating 6 visualizations.
Library and Loading Data
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
df <- fread ('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-06-29/animal_rescues.csv')
postcode <- fread('/Users/maryamkhan/Downloads/Postcode districts.csv')Data Cleaning
# Data Cleaning
is.na(df) <- df == "NULL"
df$date <- as.Date(df$date_time_of_call, format = "%d/%m/%Y")
df <- df[, longitude:=as.numeric(longitude)]
df <- df[, latitude:=as.numeric(latitude)]
df <- df[, animal_group_parent:=factor(animal_group_parent)]
df <- df[, animal_group_parent:= mapvalues(animal_group_parent, from=c("cat", "Budgie", "Pigeon", "Lamb", "Bull" ), to=c("Cat", "Bird", "Bird", "Sheep", "Cow"))]
to_filter <- sort(sapply(df$longitude, function(x) sum(is.na(x))))
#to_filter[to_filter > 0]
#sum(!is.na(df$postcode_district))/nrow(df) # due to a lot missing values we will not be using this
postcode$Easting <- NULL
postcode$Northing <- NULL
postcode$`Grid Reference` <- NULL
postcode$`Town/Area` <- NULL
postcode$Region <- NULL
postcode$`Active postcodes` <- NULL
postcode$Postcodes <- NULL
postcode$Population <- NULL
postcode$Households <- NULL
postcode$`Nearby districts` <- NULL
postcode$`UK region` <- NULL
setnames(postcode, "Postcode", "postcode_district")
# Merging lat and long with dataset
dt <- merge(x = df, y = postcode, by = "postcode_district", all.x = TRUE)
# combine all animals to others
dt$animal_group_parent <- as.factor(dt$animal_group_parent)
dt$animal_group_parent <- gsub("Unknown - Wild Animal","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Unknown - Domestic Animal Or Pet","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Unknown - Animal rescue from water - Farm animal","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Unknown - Animal rescue from below ground - Farm animal","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Lizard","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Squirrel","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Rabbit","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Hamster","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Hedgehog","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Snake","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Horse","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Tortoise","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Sheep","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Cow","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Ferret","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Fish","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Goat","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Deer","Other",dt$animal_group_parent)
dt$animal_group_parent <- gsub("Unknown - Heavy Livestock Animal","Other",dt$animal_group_parent)Animal rescue distribution by animal type in London
The first plot shows maps for each animal type. It can be seen that cats were the most common animal to be rescued followed by dogs and birds. Fox were mostly rescued from the outskirts of the city.
# Graph 1
# Data Prep
dt_geo <- st_as_sf(dt, coords = c("Longitude", "Latitude"))
# Using polygons to plot
pc_geo <- read_sf("https://raw.githubusercontent.com/sjwhitworth/london_geojson/master/london_postcodes.json")
#plot(pc_geo["Name"])
setnames(pc_geo, "Name", "postcode_district")
# Joining the data set with polygons
dt_geo2 <- merge(x = dt, y = pc_geo, by = "postcode_district" , all.x = TRUE)
dt_geo2 <- st_as_sf(dt_geo2, coords = c("Longitude", "Latitude"))
# Grouping by postcode and animal
dt_geo_sum <- dt[, (count = .N), by = c('postcode_district', 'animal_group_parent')]
dt_geo_sum2 <- merge(x = dt_geo_sum, y = pc_geo, by = "postcode_district" , all.y = TRUE)
dt_geo_sum2 <- st_as_sf(dt_geo_sum2)
# Removing NAs
row.has.na <- apply(dt_geo_sum2, 1, function(x){any(is.na(x))})
sum(row.has.na)## [1] 14
dt_geo_sum2 <- dt_geo_sum2[!row.has.na,]
## Plotting map
# Animal rescue in London
graph1 <- ggplot(dt_geo_sum2,aes(fill = V1)) +
geom_sf(color = NA) +
theme_map() +
theme(legend.position = "right" )+
labs(title = "Animal rescues in London") +
scale_fill_viridis_b(name = "Rescue") +
facet_wrap(vars(animal_group_parent))
graph1Most common animal by area
This plot shows the most common animal by area. Cats dominate as they were rescued the most. Birds and other animals were the common to be rescued from the outskirts of the city.
#Graph2
# Most common animals
# Grouping by postcode and animal
dt_geo_sum3 <- dt[ , (count = .N), by = c("postcode_district", "animal_group_parent")]
# ordering the data by postcode
dt_geo_sum3 <-dt_geo_sum3[order(dt_geo_sum3$postcode_district,-dt_geo_sum3$V1),]
# filtering for max value
dt_geo_sum3 <- dt_geo_sum3[dt_geo_sum3[, .I[V1 == max(V1)], by=postcode_district]$V1]
# joining the data with codes
dt_geo_sum3 <- merge(x = dt_geo_sum3, y = pc_geo, by = "postcode_district" , all.y = TRUE)
# Removing NA
row.has.na <- apply(dt_geo_sum3, 1, function(x){any(is.na(x))})
sum(row.has.na)## [1] 14
dt_geo_sum3 <- dt_geo_sum3[!row.has.na,]
dt_geo_sum3 <- st_as_sf(dt_geo_sum3)
graph2 <- ggplot(dt_geo_sum3) +
aes(
fill = animal_group_parent,
colour = animal_group_parent
) +
geom_sf(shape = "circle", size = 1.2) +
scale_fill_manual(
values = c(Bird = "#440154",
Cat = "#3A518A",
Dog = "#22908B",
Fox = "#5FC760",
Other = "#FDE725")
) +
scale_color_manual(
values = c(Bird = "#440154",
Cat = "#3A518A",
Dog = "#22908B",
Fox = "#5FC760",
Other = "#FDE725")
) +
labs(title = "Most common animal rescued by area") +
ggthemes::theme_map() +
theme(legend.position = "left")
graph2Total rescues over the years
This graph shows the rescues for every year from 2009 to 2021 for all animals. It can be seen that 2020 had the highest number of rescues 758 and 2021 had the lowest number of rescues 319.
# Graph 3
## Data Prep
# separate year from date to group it
dt$year <- year(ymd(dt$date))
dt$month <- month(ymd(dt$date))
# group data for year
dt_year <- dt[, (count = .N), by = c('year', 'animal_group_parent')]
dt_year_num <- dt_year[,.(Count=round(sum(V1)),Number= .N), by = year]
# Combining data to show total number of rescues
dt_year_count <- dt[, (count = .N), by = c('year')]
dt_year_count$year <- as.factor(dt_year_count$year)
# Plotting
graph3 <- ggplot(dt_year) +
aes(
x = animal_group_parent,
fill = animal_group_parent,
weight = V1
) +
geom_bar() +
scale_fill_viridis_d(option = "viridis", direction = 1) +
labs(x = "Animals", y = "Rescue Count") +
coord_flip() +
ggthemes::theme_base() +
theme(legend.position = "none")+
transition_states(year) +
labs(title = paste("Total Rescues in {closest_state}"),
subtitle = paste('Number of Rescues: {filter(dt_year_count, year == closest_state) %>% pull(V1)}' ))
graph3National Cost of Rescues
This graph shows a timeseries data for the national cost of all animal rescues made over time. 2016 was the most costly in terms of rescues followed by 2010 and 2020.
# Graph 4
## Data prep
dt$incident_notional_cost <- as.numeric(dt$incident_notional_cost)
dt_cost_sum <- dt[,sum(incident_notional_cost), by = .(year, animal_group_parent)]
dt_cost_sum[is.na(dt_cost_sum)] <- 0
## Plotting
graph4 <- ggplot(dt_cost_sum) +
aes(
x = year,
y = V1,
colour = animal_group_parent,
group = animal_group_parent
) +
geom_line(size = 2) +
scale_color_manual(
values = c(Bird = "#440154",
Cat = "#3A518A",
Dog = "#22908B",
Fox = "#5FC760",
Other = "#FDE725")
) +
labs(
x = "Year",
y = "Cost",
title = "Cost of Animal Rescues Over Time",
color = "Animals"
) +
theme_base() +
theme(legend.position = "bottom")+
transition_reveal(year) +
view_follow(fixed_y = TRUE)
graph4 Animal Rescue Violin & Box plot
The violin plot and box plot show the distribution of the data for each animal based on the first quartile (Q1), median and third quartile (Q3).
# Graph 5
graph5 <- ggplot(dt_year) +
aes(x = animal_group_parent, y = V1, fill = animal_group_parent) +
geom_boxplot(shape = "circle") +
geom_violin(alpha = 0.3)+
scale_fill_viridis_d(option = "viridis", direction = 1) +
labs(x= "Animal", y = "Rescues", title = "Animal Rescue Violin & Box Plot") +
theme_base() +
theme(legend.position = "none")
graph5Animal Density Chart
This shows the numeric distribution of animals.
# Graph 6
dt$year <- as.factor(dt$year)
dt$week <- dt[,strftime(dt$date, format = "%V")]
dt_month <- dt[, (count = .N), by = c('month', 'animal_group_parent')]
dt_2020 <- subset(dt, substr(year, 1, 5) >= '2020')
dt_month2020 <- dt_2020[, (count = .N), by = c('year','month', 'animal_group_parent')]
# Plotting
graph6 <- ggplot(dt_month2020) +
aes(x = V1, fill = animal_group_parent) +
geom_density(adjust = 1L) +
scale_fill_viridis_d(option = "viridis", direction = 1) +
labs(x = "Rescues", title = "Animal Density Chart ", fill = "Animals") +
ggthemes::theme_base() +
theme(legend.position = "bottom") +
facet_wrap(vars(animal_group_parent))
graph6Conclusion
R contains a lot of powerful packages to create aesthetically pleasing visualizations and animations.It gives you a lot of room to play around with colours, themes and animations. However, not all plots and graphs require animations.