library("readr")library(sf)library(tidyverse)library(tmap)library(ggplot2)library(ggmapcn) # to use a Pacific centered map # load data into R whale_reference <-read_delim("Movements of Australia's east coast humpback whales-reference-data.csv") |>mutate(`deploy-on-date`=as.POSIXct(`deploy-on-date`), `deploy-off-date`=as.POSIXct(`deploy-off-date`))# explore this data sethead(whale_reference)
tibble [50 × 18] (S3: tbl_df/tbl/data.frame)
$ tag-id : num [1:50] 88723 88733 88743 88735 88746 ...
$ animal-id : num [1:50] 88723 88733 88743 88735 88746 ...
$ animal-taxon : chr [1:50] "Megaptera novaeangliae" "Megaptera novaeangliae" "Megaptera novaeangliae" "Megaptera novaeangliae" ...
$ deploy-on-date : POSIXct[1:50], format: "2008-10-24" "2008-10-24" ...
$ deploy-off-date : POSIXct[1:50], format: "2009-01-13 23:59:59" "2008-12-04 23:59:59" ...
$ animal-group-id : chr [1:50] "breeding stock E1" "breeding stock E1" "breeding stock E1" "breeding stock E1" ...
$ animal-life-stage : chr [1:50] "Adult" "Adult" "Adult" "Subadult" ...
$ animal-sex : chr [1:50] "u" "f" "m" "m" ...
$ attachment-type : chr [1:50] "implant" "implant" "implant" "implant" ...
$ deploy-on-latitude : num [1:50] -37.1 -37.1 -37.1 -37.1 -37.2 ...
$ deploy-on-longitude : num [1:50] 150 150 150 150 150 ...
$ deployment-id : chr [1:50] "88723-88723" "88733-88733" "88743-88743" "88735-88735" ...
$ duty-cycle : chr [1:50] "30s repetition rate from 12:00-18:00 UTC" "30s repetition rate from 12:00-18:00 UTC" "30s repetition rate from 12:00-18:00 UTC" "30s repetition rate from 12:00-18:00 UTC" ...
$ manipulation-type : chr [1:50] "none" "none" "none" "none" ...
$ outlier-comments : chr [1:50] "Duplicate Filter applied in Movebank to non-modelled data with preference for higher quality Argos LC" "Duplicate Filter applied in Movebank to non-modelled data with preference for higher quality Argos LC" "Duplicate Filter applied in Movebank to non-modelled data with preference for higher quality Argos LC" "Duplicate Filter applied in Movebank to non-modelled data with preference for higher quality Argos LC" ...
$ tag-manufacturer-name: chr [1:50] "Wildlife Computers" "Wildlife Computers" "Wildlife Computers" "Wildlife Computers" ...
$ tag-model : chr [1:50] "Cricket" "Cricket" "Cricket" "Cricket" ...
$ tag-readout-method : chr [1:50] "satellite" "satellite" "satellite" "satellite" ...
# create new file with important columns, discard other columns whale_reference_selected <- whale_reference |>select(-`tag-id`, -`animal-taxon`, -`animal-group-id`, -`animal-life-stage`, -`animal-sex`, -`attachment-type`, -`deployment-id`, -`manipulation-type`, -`outlier-comments`, -`tag-readout-method`)# amend column names, to make later analysis easier names(whale_reference_selected) <-gsub("-", "_", names(whale_reference_selected))colnames(whale_reference_selected)
# load movement data into R whale_movement <-read_delim("Movements of Australia's east coast humpback whales.csv") |>mutate(timestamp =as.POSIXct(timestamp))# explore this data setcolnames(whale_movement)
tibble [30,388 × 19] (S3: tbl_df/tbl/data.frame)
$ event-id : num [1:30388] 3.00e+10 3.01e+10 3.00e+10 3.01e+10 3.00e+10 ...
$ visible : logi [1:30388] TRUE TRUE TRUE TRUE TRUE TRUE ...
$ timestamp : POSIXct[1:30388], format: "2010-02-21 09:48:00" "2010-02-21 09:48:00" ...
$ location-long : num [1:30388] 164 164 164 164 164 ...
$ location-lat : num [1:30388] -66.9 -66.9 -66.9 -66.9 -66.9 ...
$ algorithm-marked-outlier : logi [1:30388] NA NA NA NA NA NA ...
$ argos:lat1 : num [1:30388] -66.9 NA -66.9 NA -66.9 ...
$ argos:lat2 : num [1:30388] -66.9 NA -66.9 NA -66.9 ...
$ argos:lc : chr [1:30388] "1" NA "0" NA ...
$ argos:location-algorithm : chr [1:30388] "least squares" NA "least squares" NA ...
$ argos:lon1 : num [1:30388] 164 NA 164 NA 164 ...
$ argos:lon2 : num [1:30388] 164 NA 164 NA 164 ...
$ comments : chr [1:30388] NA "state-space model location estimate (see Andrews-Goff et al. 2023, https://doi.org/10.3897/BDJ.11.e114729)" NA "state-space model location estimate (see Andrews-Goff et al. 2023, https://doi.org/10.3897/BDJ.11.e114729)" ...
$ modelled : logi [1:30388] NA TRUE NA TRUE NA TRUE ...
$ sensor-type : chr [1:30388] "argos-doppler-shift" "argos-doppler-shift" "argos-doppler-shift" "argos-doppler-shift" ...
$ individual-taxon-canonical-name: chr [1:30388] "Megaptera novaeangliae" "Megaptera novaeangliae" "Megaptera novaeangliae" "Megaptera novaeangliae" ...
$ tag-local-identifier : num [1:30388] 53348 53348 53348 53348 53348 ...
$ individual-local-identifier : num [1:30388] 53348 53348 53348 53348 53348 ...
$ study-name : chr [1:30388] "Movements of Australia's east coast humpback whales" "Movements of Australia's east coast humpback whales" "Movements of Australia's east coast humpback whales" "Movements of Australia's east coast humpback whales" ...
# Since preprocessing steps were conducted by the authors (Andrews-Goff et al. 2023), the modelled == TRUE locations are the positions estimated through the state-space model. # Hence I will clean the dataset and delete all unnecessary locations, which are NOT part of this model. whale_movement_modelled <- whale_movement |>filter(modelled =="TRUE")# now I select the columns I need for data analysis. # first I check a few unique entries to ensure there is only one entry or less, before I make the selection to remove them. unique(whale_movement_modelled$visible)
[1] "state-space model location estimate (see Andrews-Goff et al. 2023, https://doi.org/10.3897/BDJ.11.e114729)"
unique(whale_movement_modelled$modelled)
[1] TRUE
# all can be removed, since these only show one value and are not needed for further analysis. whale_movement_selected <- whale_movement_modelled |>select(`event-id`, timestamp, `location-long`, `location-lat`, `individual-local-identifier`) |>rename_with(~gsub("-", "_", .x)) # change all "-" to "_"# now I combine both selected datasets with a join, based on the animal_id or individual_local_identifier, which is both the same. whale_joined <-full_join(whale_movement_selected, whale_reference_selected, by =c("individual_local_identifier"="animal_id"))# check how many individuals were tagged length(unique(whale_joined$individual_local_identifier))
[1] 50
# check which duty cycles were used unique(whale_joined$duty_cycle)
[1] "30s repetition rate from 00:00-04:00 UTC and 12:00-16:00 UTC"
[2] "40s repetition rate from 00:00-06:00 UTC"
[3] "30s repetition rate from 12:00-18:00 UTC"
[4] "30s repetition rate from 08:00-12:00 UTC and 20:00-00:00 UTC"
# check for missing values colSums(is.na(whale_joined))
# A tibble: 2 × 12
event_id timestamp location_long location_lat individual_local_identifier
<dbl> <dttm> <dbl> <dbl> <dbl>
1 NA NA NA NA 88752
2 NA NA NA NA 53359
# ℹ 7 more variables: deploy_on_date <dttm>, deploy_off_date <dttm>,
# deploy_on_latitude <dbl>, deploy_on_longitude <dbl>, duty_cycle <chr>,
# tag_manufacturer_name <chr>, tag_model <chr>
# missing values for event_id, timestamp, location_long, and location_lat are the same.# as I don't have any information for movement, I delete these rows. whale_joined <- whale_joined |>filter(!is.na(event_id))# double check for NA values.colSums(is.na(whale_joined))
# Only tag_model has 232 NA's, which is OK, since Sirtrack generally did not specify a tag model. # next I check if the timestamps are in order, or need to be ordered. whale_joined |>group_by(individual_local_identifier) |>summarise(is_ordered =all(diff(timestamp) >0) )
# check time gaps for each individual.whale_joined |>group_by(individual_local_identifier) |>arrange(timestamp) |>mutate(time_gap_hours =as.numeric(difftime(timestamp, lag(timestamp), units ="hours"))) |>summarise(max_gap =max(time_gap_hours, na.rm =TRUE))
# relatively large time gaps are noted. Thus, it may become difficult to calculate speed, turning angles, and behaviour classifications.# however, if calculations shall be done, an interpolation may be suitable, but only for small gaps. e.g. less than 6-12h. # check my coordinates summary(whale_joined$location_lat)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-70.02 -65.09 -62.37 -54.07 -41.42 -15.74
summary(whale_joined$location_long)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-180.0 149.7 156.8 151.0 164.1 180.0
sum(whale_joined$location_lat ==0, na.rm =TRUE)
[1] 0
sum(whale_joined$location_long ==0, na.rm =TRUE)
[1] 0
# coordinates look fine # check for duplicate rowssum(duplicated(whale_joined))
[1] 0
# no duplicates observed # check large sampling number per individual iswhale_joined |>count(individual_local_identifier) |>arrange(n)
# change column name to keep it shorterwhale_joined <- whale_joined |>rename(individual_id ="individual_local_identifier")# check the movements with ggplot ggplot(whale_joined, aes(x = location_long, y = location_lat)) +borders("world", colour ="gray70", fill ="gray95") +geom_path(aes(group = individual_id, color =factor(individual_id)), alpha =0.7) +geom_point(size =0.5) +coord_fixed(1.3) +theme_minimal()
# since the Pacific is not centered, it seems like points are far away, even though they are close. # I create an sf object, so that I can center the pacific. whale_sf <-st_as_sf(whale_joined, coords =c("location_long", "location_lat"),crs =4326)whale_sf$individual_id <-as.factor(whale_sf$individual_id)crs_robin_150 <-"+proj=robin +lon_0=150 +datum=WGS84"whale_sf_robin <-st_transform(whale_sf, crs = crs_robin_150) # transform coordinates, so its in the same CRS ggplot() +geom_world(crs = crs_robin_150) +coord_sf(crs = crs_robin_150) +geom_sf(data = whale_sf_robin,aes(color = individual_id),size =0.02, alpha =0.6) +theme_void() +theme(legend.position ="none")
my dataset looks now good to work with. I export the file as csv.