preprocessing

library("readr")
library(sf)
library(tidyverse)
library(tmap)
library(ggplot2)
library(ggmapcn) # to use a Pacific centered map 

# load data into R 
whale_reference <- read_delim("Movements of Australia's east coast humpback whales-reference-data.csv") |> 
  mutate(`deploy-on-date` = as.POSIXct(`deploy-on-date`), 
         `deploy-off-date` = as.POSIXct(`deploy-off-date`))

# explore this data set
head(whale_reference)
# A tibble: 6 × 18
  `tag-id` `animal-id` `animal-taxon`    `deploy-on-date`    `deploy-off-date`  
     <dbl>       <dbl> <chr>             <dttm>              <dttm>             
1    88723       88723 Megaptera novaea… 2008-10-24 00:00:00 2009-01-13 23:59:59
2    88733       88733 Megaptera novaea… 2008-10-24 00:00:00 2008-12-04 23:59:59
3    88743       88743 Megaptera novaea… 2008-10-24 00:00:00 2008-11-05 23:59:59
4    88735       88735 Megaptera novaea… 2008-10-24 00:00:00 2008-12-01 23:59:59
5    88746       88746 Megaptera novaea… 2008-10-24 00:00:00 2008-11-12 23:59:59
6    88732       88732 Megaptera novaea… 2008-10-24 00:00:00 2008-11-27 23:59:59
# ℹ 13 more variables: `animal-group-id` <chr>, `animal-life-stage` <chr>,
#   `animal-sex` <chr>, `attachment-type` <chr>, `deploy-on-latitude` <dbl>,
#   `deploy-on-longitude` <dbl>, `deployment-id` <chr>, `duty-cycle` <chr>,
#   `manipulation-type` <chr>, `outlier-comments` <chr>,
#   `tag-manufacturer-name` <chr>, `tag-model` <chr>,
#   `tag-readout-method` <chr>
str(whale_reference)
tibble [50 × 18] (S3: tbl_df/tbl/data.frame)
 $ tag-id               : num [1:50] 88723 88733 88743 88735 88746 ...
 $ animal-id            : num [1:50] 88723 88733 88743 88735 88746 ...
 $ animal-taxon         : chr [1:50] "Megaptera novaeangliae" "Megaptera novaeangliae" "Megaptera novaeangliae" "Megaptera novaeangliae" ...
 $ deploy-on-date       : POSIXct[1:50], format: "2008-10-24" "2008-10-24" ...
 $ deploy-off-date      : POSIXct[1:50], format: "2009-01-13 23:59:59" "2008-12-04 23:59:59" ...
 $ animal-group-id      : chr [1:50] "breeding stock E1" "breeding stock E1" "breeding stock E1" "breeding stock E1" ...
 $ animal-life-stage    : chr [1:50] "Adult" "Adult" "Adult" "Subadult" ...
 $ animal-sex           : chr [1:50] "u" "f" "m" "m" ...
 $ attachment-type      : chr [1:50] "implant" "implant" "implant" "implant" ...
 $ deploy-on-latitude   : num [1:50] -37.1 -37.1 -37.1 -37.1 -37.2 ...
 $ deploy-on-longitude  : num [1:50] 150 150 150 150 150 ...
 $ deployment-id        : chr [1:50] "88723-88723" "88733-88733" "88743-88743" "88735-88735" ...
 $ duty-cycle           : chr [1:50] "30s repetition rate from 12:00-18:00 UTC" "30s repetition rate from 12:00-18:00 UTC" "30s repetition rate from 12:00-18:00 UTC" "30s repetition rate from 12:00-18:00 UTC" ...
 $ manipulation-type    : chr [1:50] "none" "none" "none" "none" ...
 $ outlier-comments     : chr [1:50] "Duplicate Filter applied in Movebank to non-modelled data with preference for higher quality Argos LC" "Duplicate Filter applied in Movebank to non-modelled data with preference for higher quality Argos LC" "Duplicate Filter applied in Movebank to non-modelled data with preference for higher quality Argos LC" "Duplicate Filter applied in Movebank to non-modelled data with preference for higher quality Argos LC" ...
 $ tag-manufacturer-name: chr [1:50] "Wildlife Computers" "Wildlife Computers" "Wildlife Computers" "Wildlife Computers" ...
 $ tag-model            : chr [1:50] "Cricket" "Cricket" "Cricket" "Cricket" ...
 $ tag-readout-method   : chr [1:50] "satellite" "satellite" "satellite" "satellite" ...
# create new file with important columns, discard other columns 

whale_reference_selected <- whale_reference |>
  select(-`tag-id`, -`animal-taxon`, -`animal-group-id`, -`animal-life-stage`, -`animal-sex`, -`attachment-type`, -`deployment-id`, -`manipulation-type`, -`outlier-comments`, -`tag-readout-method`)

# amend column names, to make later analysis easier 
names(whale_reference_selected) <- gsub("-", "_", names(whale_reference_selected))

colnames(whale_reference_selected)
[1] "animal_id"             "deploy_on_date"        "deploy_off_date"      
[4] "deploy_on_latitude"    "deploy_on_longitude"   "duty_cycle"           
[7] "tag_manufacturer_name" "tag_model"            
# load movement data into R 
whale_movement <- read_delim("Movements of Australia's east coast humpback whales.csv") |> 
  mutate(timestamp = as.POSIXct(timestamp))

# explore this data set
colnames(whale_movement)
 [1] "event-id"                        "visible"                        
 [3] "timestamp"                       "location-long"                  
 [5] "location-lat"                    "algorithm-marked-outlier"       
 [7] "argos:lat1"                      "argos:lat2"                     
 [9] "argos:lc"                        "argos:location-algorithm"       
[11] "argos:lon1"                      "argos:lon2"                     
[13] "comments"                        "modelled"                       
[15] "sensor-type"                     "individual-taxon-canonical-name"
[17] "tag-local-identifier"            "individual-local-identifier"    
[19] "study-name"                     
str(whale_movement)
tibble [30,388 × 19] (S3: tbl_df/tbl/data.frame)
 $ event-id                       : num [1:30388] 3.00e+10 3.01e+10 3.00e+10 3.01e+10 3.00e+10 ...
 $ visible                        : logi [1:30388] TRUE TRUE TRUE TRUE TRUE TRUE ...
 $ timestamp                      : POSIXct[1:30388], format: "2010-02-21 09:48:00" "2010-02-21 09:48:00" ...
 $ location-long                  : num [1:30388] 164 164 164 164 164 ...
 $ location-lat                   : num [1:30388] -66.9 -66.9 -66.9 -66.9 -66.9 ...
 $ algorithm-marked-outlier       : logi [1:30388] NA NA NA NA NA NA ...
 $ argos:lat1                     : num [1:30388] -66.9 NA -66.9 NA -66.9 ...
 $ argos:lat2                     : num [1:30388] -66.9 NA -66.9 NA -66.9 ...
 $ argos:lc                       : chr [1:30388] "1" NA "0" NA ...
 $ argos:location-algorithm       : chr [1:30388] "least squares" NA "least squares" NA ...
 $ argos:lon1                     : num [1:30388] 164 NA 164 NA 164 ...
 $ argos:lon2                     : num [1:30388] 164 NA 164 NA 164 ...
 $ comments                       : chr [1:30388] NA "state-space model location estimate (see Andrews-Goff et al. 2023, https://doi.org/10.3897/BDJ.11.e114729)" NA "state-space model location estimate (see Andrews-Goff et al. 2023, https://doi.org/10.3897/BDJ.11.e114729)" ...
 $ modelled                       : logi [1:30388] NA TRUE NA TRUE NA TRUE ...
 $ sensor-type                    : chr [1:30388] "argos-doppler-shift" "argos-doppler-shift" "argos-doppler-shift" "argos-doppler-shift" ...
 $ individual-taxon-canonical-name: chr [1:30388] "Megaptera novaeangliae" "Megaptera novaeangliae" "Megaptera novaeangliae" "Megaptera novaeangliae" ...
 $ tag-local-identifier           : num [1:30388] 53348 53348 53348 53348 53348 ...
 $ individual-local-identifier    : num [1:30388] 53348 53348 53348 53348 53348 ...
 $ study-name                     : chr [1:30388] "Movements of Australia's east coast humpback whales" "Movements of Australia's east coast humpback whales" "Movements of Australia's east coast humpback whales" "Movements of Australia's east coast humpback whales" ...
# Since preprocessing steps were conducted by the authors (Andrews-Goff et al. 2023), the modelled == TRUE locations are the positions estimated through the state-space model. 
# Hence I will clean the dataset and delete all unnecessary locations, which are NOT part of this model. 

whale_movement_modelled <- whale_movement |>
  filter(modelled == "TRUE")

# now I select the columns I need for data analysis. 
# first I check a few unique entries to ensure there is only one entry or less, before I make the selection to remove them. 

unique(whale_movement_modelled$visible)
[1] TRUE
unique(whale_movement_modelled$`algorithm-marked-outlier`)
[1] NA
unique(whale_movement_modelled$`argos:lat1`)
[1] NA
unique(whale_movement_modelled$`argos:lat2`)
[1] NA
unique(whale_movement_modelled$`argos:lc`)
[1] NA
unique(whale_movement_modelled$`argos:location-algorithm`)
[1] NA
unique(whale_movement_modelled$`argos:lon1`)
[1] NA
unique(whale_movement_modelled$`argos:lon2`)
[1] NA
unique(whale_movement_modelled$comments)
[1] "state-space model location estimate (see Andrews-Goff et al. 2023, https://doi.org/10.3897/BDJ.11.e114729)"
unique(whale_movement_modelled$modelled)
[1] TRUE
# all can be removed, since these only show one value and are not needed for further analysis. 

whale_movement_selected <- whale_movement_modelled |>
  select(`event-id`, timestamp, `location-long`, `location-lat`, `individual-local-identifier`) |>
  rename_with(~ gsub("-", "_", .x)) # change all "-" to "_"

# now I combine both selected datasets with a join, based on the animal_id or individual_local_identifier, which is both the same. 

whale_joined <- full_join(whale_movement_selected, whale_reference_selected, by = c("individual_local_identifier" = "animal_id"))

# check how many individuals were tagged 
length(unique(whale_joined$individual_local_identifier))
[1] 50
# check which duty cycles were used 
unique(whale_joined$duty_cycle)
[1] "30s repetition rate from 00:00-04:00 UTC and 12:00-16:00 UTC"
[2] "40s repetition rate from 00:00-06:00 UTC"                    
[3] "30s repetition rate from 12:00-18:00 UTC"                    
[4] "30s repetition rate from 08:00-12:00 UTC and 20:00-00:00 UTC"
# check for missing values 
colSums(is.na(whale_joined))
                   event_id                   timestamp 
                          2                           2 
              location_long                location_lat 
                          2                           2 
individual_local_identifier              deploy_on_date 
                          0                           0 
            deploy_off_date          deploy_on_latitude 
                          0                           0 
        deploy_on_longitude                  duty_cycle 
                          0                           0 
      tag_manufacturer_name                   tag_model 
                          0                         232 
whale_joined[
  is.na(whale_joined$event_id) |
    is.na(whale_joined$timestamp) | 
    is.na(whale_joined$location_long) |
    is.na(whale_joined$location_lat),
]
# A tibble: 2 × 12
  event_id timestamp location_long location_lat individual_local_identifier
     <dbl> <dttm>            <dbl>        <dbl>                       <dbl>
1       NA NA                   NA           NA                       88752
2       NA NA                   NA           NA                       53359
# ℹ 7 more variables: deploy_on_date <dttm>, deploy_off_date <dttm>,
#   deploy_on_latitude <dbl>, deploy_on_longitude <dbl>, duty_cycle <chr>,
#   tag_manufacturer_name <chr>, tag_model <chr>
which(
  is.na(whale_joined$event_id) |
    is.na(whale_joined$timestamp) | 
    is.na(whale_joined$location_long) |
    is.na(whale_joined$location_lat),
)
[1] 13312 13313
# missing values for event_id, timestamp, location_long, and location_lat are the same.
# as I don't have any information for movement, I delete these rows. 

whale_joined <- whale_joined |>
  filter(!is.na(event_id))

# double check for NA values.
colSums(is.na(whale_joined)) 
                   event_id                   timestamp 
                          0                           0 
              location_long                location_lat 
                          0                           0 
individual_local_identifier              deploy_on_date 
                          0                           0 
            deploy_off_date          deploy_on_latitude 
                          0                           0 
        deploy_on_longitude                  duty_cycle 
                          0                           0 
      tag_manufacturer_name                   tag_model 
                          0                         232 
# Only tag_model has 232 NA's, which is OK, since Sirtrack generally did not specify a tag model. 

# next I check if the timestamps are in order, or need to be ordered. 
whale_joined |>
  group_by(individual_local_identifier) |>
  summarise(
    is_ordered = all(diff(timestamp) > 0)
  )
# A tibble: 48 × 2
   individual_local_identifier is_ordered
                         <dbl> <lgl>     
 1                       53348 TRUE      
 2                       53376 TRUE      
 3                       53383 TRUE      
 4                       64235 TRUE      
 5                       64238 TRUE      
 6                       88717 TRUE      
 7                       88718 TRUE      
 8                       88723 TRUE      
 9                       88725 TRUE      
10                       88728 TRUE      
# ℹ 38 more rows
# check time gaps for each individual.

whale_joined |>
  group_by(individual_local_identifier) |>
  arrange(timestamp) |>
  mutate(time_gap_hours = as.numeric(difftime(timestamp, lag(timestamp), units = "hours"))) |>
  summarise(max_gap = max(time_gap_hours, na.rm = TRUE))
# A tibble: 48 × 2
   individual_local_identifier max_gap
                         <dbl>   <dbl>
 1                       53348   97.4 
 2                       53376    9.08
 3                       53383   34.3 
 4                       64235   24.2 
 5                       64238   22.6 
 6                       88717   23.8 
 7                       88718   42.6 
 8                       88723   47.2 
 9                       88725   43.9 
10                       88728   42.3 
# ℹ 38 more rows
# relatively large time gaps are noted. Thus, it may become difficult to calculate speed, turning angles, and behaviour classifications.
# however, if calculations shall be done, an interpolation may be suitable, but only for small gaps. e.g. less than 6-12h. 

# check my coordinates 
summary(whale_joined$location_lat)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -70.02  -65.09  -62.37  -54.07  -41.42  -15.74 
summary(whale_joined$location_long)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -180.0   149.7   156.8   151.0   164.1   180.0 
sum(whale_joined$location_lat == 0, na.rm = TRUE)
[1] 0
sum(whale_joined$location_long == 0, na.rm = TRUE)
[1] 0
# coordinates look fine 

# check for duplicate rows
sum(duplicated(whale_joined))
[1] 0
# no duplicates observed 

# check large sampling number per individual is
whale_joined |>
  count(individual_local_identifier) |>
  arrange(n)
# A tibble: 48 × 2
   individual_local_identifier     n
                         <dbl> <int>
 1                       98109     5
 2                       53383     7
 3                       88744    17
 4                       88750    23
 5                       96404    28
 6                       88755    38
 7                       64238    47
 8                       53376    48
 9                       88747    55
10                       88756    55
# ℹ 38 more rows
# change column name to keep it shorter
whale_joined <- whale_joined |>
  rename(individual_id = "individual_local_identifier")

# check the movements with ggplot 

ggplot(whale_joined, aes(x = location_long, y = location_lat)) +
  borders("world", colour = "gray70", fill = "gray95") +
  geom_path(aes(group = individual_id, color = factor(individual_id)), alpha = 0.7) +
  geom_point(size = 0.5) +
  coord_fixed(1.3) +
  theme_minimal()

# since the Pacific is not centered, it seems like points are far away, even though they are close. 
# I create an sf object, so that I can center the pacific. 

whale_sf <- st_as_sf(whale_joined, 
                     coords = c("location_long", "location_lat"),
                     crs = 4326)

whale_sf$individual_id <- as.factor(whale_sf$individual_id)

crs_robin_150 <- "+proj=robin +lon_0=150 +datum=WGS84"
whale_sf_robin <- st_transform(whale_sf, crs = crs_robin_150) # transform coordinates, so its in the same CRS 

ggplot() +
  geom_world(crs = crs_robin_150) +
  coord_sf(crs = crs_robin_150) +
  geom_sf(data = whale_sf_robin,
          aes(color = individual_id),
          size = 0.02, 
          alpha = 0.6) +
  theme_void() + 
  theme(legend.position = "none")

my dataset looks now good to work with. I export the file as csv.

write.csv(whale_joined, file="my_tracks.csv")