Data_Exploratory_GEO880

Author

Group 6 - Robin Merz

Setup

# Clear the workspace: remove every visible object from the current
# environment. Loaded packages and options are not affected by this call.
  rm(list = ls())

#Import necessary libraries
  library(readr)
  library(dplyr)
  library(stringr)
  library(tidyverse)
  library(lubridate)
  library(scales)
  library(ggtext)
  library(ggplot2)

# Import the tracking data (path is relative to the working directory)
  Nutcracker <- read_csv(
    file = "00_data/NuCra_Davos_all_data_2025-02-07_V2.csv"
  )

# Remove outliers: drop the rows whose value in column `...1` (the first,
# unnamed column of the CSV) is one of the listed row ids
  remove_ids <- c(48325, 15381, 15382, 15383, 45363)
  Nutcracker <- filter(Nutcracker, !(...1 %in% remove_ids))

Data structure

# Compact overview of the data: one line per column with its type and
# first values
  Nutcracker |> str()
spc_tbl_ [49,758 × 34] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ ...1                   : num [1:49758] 1 2 3 4 5 6 7 8 9 10 ...
 $ id                     : chr [1:49758] ".458" ".458" ".458" ".458" ...
 $ hdop                   : num [1:49758] 1.7 2.8 1.9 1.1 1.4 1.3 1.2 3.6 3.1 3 ...
 $ longitude              : num [1:49758] 9.91 9.91 9.91 9.87 9.89 ...
 $ latitude               : num [1:49758] 46.8 46.8 46.8 46.8 46.8 ...
 $ altitude               : num [1:49758] 2073 2072 2071 1998 1998 ...
 $ satellites             : chr [1:49758] "6/6" "5/5" "6/6" "6/7" ...
 $ datetime               : POSIXct[1:49758], format: "2022-08-02 11:00:10" "2022-08-02 12:00:11" ...
 $ tag.type               : chr [1:49758] "vhf" "vhf" "vhf" "vhf" ...
 $ year                   : num [1:49758] 2022 2022 2022 2022 2022 ...
 $ month                  : chr [1:49758] "08" "08" "08" "08" ...
 $ season                 : chr [1:49758] "summer" "summer" "summer" "summer" ...
 $ brutzeit               : chr [1:49758] "no" "no" "no" "no" ...
 $ ring_no                : chr [1:49758] "K125908" "K125908" "K125908" "K125908" ...
 $ bag                    : num [1:49758] 26 26 26 26 26 26 26 26 26 26 ...
 $ bird_and_bag           : num [1:49758] 233 233 233 233 233 233 233 233 233 233 ...
 $ weight                 : num [1:49758] 207 207 207 207 207 207 207 207 207 207 ...
 $ wing_length            : num [1:49758] 191 191 191 191 191 191 191 191 191 191 ...
 $ bill_depth             : num [1:49758] 16.3 16.3 16.3 16.3 16.3 ...
 $ bill_length            : num [1:49758] 46.4 46.4 46.4 46.4 46.4 ...
 $ tarsus_length          : num [1:49758] 42 42 42 42 42 ...
 $ feathers               : chr [1:49758] "yes" "yes" "yes" "yes" ...
 $ stage.at.capture       : chr [1:49758] "juvenile" "juvenile" "juvenile" "juvenile" ...
 $ photo                  : num [1:49758] 0 0 0 0 0 0 0 0 0 0 ...
 $ datetime.at.capture    : POSIXct[1:49758], format: "2022-08-02 09:36:00" "2022-08-02 09:36:00" ...
 $ timediff               : num [1:49758] 0 -1 -2 -8 -10 ...
 $ steplength             : num [1:49758] 0 19.4 26.1 2716.8 1490.7 ...
 $ stepsize.from.last.hour: num [1:49758] 0 19.4 0 0 0 ...
 $ stage.current          : chr [1:49758] "juvenile" "juvenile" "juvenile" "juvenile" ...
 $ id.stage               : chr [1:49758] ".458.juvenile" ".458.juvenile" ".458.juvenile" ".458.juvenile" ...
 $ date                   : Date[1:49758], format: "2022-08-02" "2022-08-02" ...
 $ ndays.new              : num [1:49758] 33 33 33 33 33 33 33 33 33 33 ...
 $ n.datapoints           : num [1:49758] 183 183 183 183 183 183 183 183 183 183 ...
 $ dates.spanned.per.year : num [1:49758] 33 33 33 33 33 33 33 33 33 33 ...
 - attr(*, "spec")=
  .. cols(
  ..   ...1 = col_double(),
  ..   id = col_character(),
  ..   hdop = col_double(),
  ..   longitude = col_double(),
  ..   latitude = col_double(),
  ..   altitude = col_double(),
  ..   satellites = col_character(),
  ..   datetime = col_datetime(format = ""),
  ..   tag.type = col_character(),
  ..   year = col_double(),
  ..   month = col_character(),
  ..   season = col_character(),
  ..   brutzeit = col_character(),
  ..   ring_no = col_character(),
  ..   bag = col_double(),
  ..   bird_and_bag = col_double(),
  ..   weight = col_double(),
  ..   wing_length = col_double(),
  ..   bill_depth = col_double(),
  ..   bill_length = col_double(),
  ..   tarsus_length = col_double(),
  ..   feathers = col_character(),
  ..   stage.at.capture = col_character(),
  ..   photo = col_double(),
  ..   datetime.at.capture = col_datetime(format = ""),
  ..   timediff = col_double(),
  ..   steplength = col_double(),
  ..   stepsize.from.last.hour = col_double(),
  ..   stage.current = col_character(),
  ..   id.stage = col_character(),
  ..   date = col_date(format = ""),
  ..   ndays.new = col_double(),
  ..   n.datapoints = col_double(),
  ..   dates.spanned.per.year = col_double()
  .. )
 - attr(*, "problems")=<externalptr> 
# Number of variables (columns) in the data set
  ncol(Nutcracker)
[1] 34
  # Names of all variables (columns)
  colnames(Nutcracker)
 [1] "...1"                    "id"                     
 [3] "hdop"                    "longitude"              
 [5] "latitude"                "altitude"               
 [7] "satellites"              "datetime"               
 [9] "tag.type"                "year"                   
[11] "month"                   "season"                 
[13] "brutzeit"                "ring_no"                
[15] "bag"                     "bird_and_bag"           
[17] "weight"                  "wing_length"            
[19] "bill_depth"              "bill_length"            
[21] "tarsus_length"           "feathers"               
[23] "stage.at.capture"        "photo"                  
[25] "datetime.at.capture"     "timediff"               
[27] "steplength"              "stepsize.from.last.hour"
[29] "stage.current"           "id.stage"               
[31] "date"                    "ndays.new"              
[33] "n.datapoints"            "dates.spanned.per.year" 
# Birds in the data set: list every distinct id once, in order of first
# appearance (the length of this vector is the number of birds)
  unique(Nutcracker[["id"]])
  [1] ".458" ".468" "0500" "0504" "0516" "0519" "061c" "062b" "0630" "0633"
 [11] "063a" "063b" "19"   "20"   "21"   "22"   "24"   "25"   "27"   "28"  
 [21] "29"   "30"   "31"   "32"   "33"   "36"   "37"   "5449" "5450" "5451"
 [31] "5454" "5995" "5996" "5997" "5998" "5999" "6000" "6001" "6002" "6003"
 [41] "6004" "6005" "6006" "6447" "6448" "6449" "6450" "6451" "6452" "6453"
 [51] "6454" "6455" "6456" "6457" "6458" "6459" "6460" "6461" "6462" "6463"
 [61] "6464" "6465" "6466" "6467" "6468" "6522" "6523" "6524" "6525" "6526"
 [71] "6527" "6528" "7314" "7315" "7316" "7317" "7318" "7319" "7320" "7321"
 [81] "7322" "7323" "7324" "7325" "7326" "7327" "7328" "7329" "7330" "7331"
 [91] "7332" "7930" "7931" "7932" "7933" "7934" "7935" "7936" "7937" "7938"
[101] "7939" "7940" "7941" "7942" "7943" "7944" "7945" "7946" "7947" "7948"
[111] "7949" "7950" "85"   "88"   "89"  

About this chunk:

  • This dataset contains 34 variables, including: Bird_ID, individual metadata, GPS locations and additional information such as whether a datapoint is collected during breeding season or not.

  • The dataset contains 115 individuals

Number of data points

# Total number of datapoints (rows)
  dim(Nutcracker)[1]
[1] 49758
# Number of datapoints per individual, printed as a compact three-column
# table ("**id**: count") that reads top-to-bottom, then left-to-right.
  counts <- table(Nutcracker$id)

  # ids and their counts as plain vectors
  ids <- names(counts)
  values <- as.vector(counts)
  n <- length(ids)
  cols <- 3
  rows <- ceiling(n / cols)

  # Build all cell labels at once and lay them out column-wise; the last
  # column is padded with NA so that the matrix is always rows x cols.
  cells <- sprintf("**%s**: %d", ids, values)
  mat <- matrix(
    c(cells, rep(NA_character_, rows * cols - n)),
    nrow = rows,
    ncol = cols
  )

  # Print one line per matrix row. A plain loop is used instead of apply()
  # because only the side effect is wanted: apply() would return (and the
  # chunk would print) a stray NULL. seq_len() also copes with zero rows.
  for (r in seq_len(rows)) {
    row_cells <- mat[r, ]
    row_cells[is.na(row_cells)] <- ""
    cat(paste(row_cells, collapse = "  |  "), "\n")
  }
**.458**: 183  |  **6003**: 1134  |  **7320**: 216 
**.468**: 282  |  **6004**: 642  |  **7321**: 72 
**0500**: 197  |  **6005**: 189  |  **7322**: 244 
**0504**: 660  |  **6006**: 60  |  **7323**: 325 
**0516**: 336  |  **6447**: 724  |  **7324**: 201 
**0519**: 187  |  **6448**: 784  |  **7325**: 118 
**061c**: 109  |  **6449**: 137  |  **7326**: 247 
**062b**: 114  |  **6450**: 10  |  **7327**: 241 
**0630**: 15  |  **6451**: 248  |  **7328**: 259 
**0633**: 190  |  **6452**: 906  |  **7329**: 1132 
**063a**: 37  |  **6453**: 2735  |  **7330**: 582 
**063b**: 5  |  **6454**: 592  |  **7331**: 288 
**19**: 924  |  **6455**: 1966  |  **7332**: 156 
**20**: 405  |  **6456**: 1085  |  **7930**: 3 
**21**: 377  |  **6457**: 127  |  **7931**: 45 
**22**: 705  |  **6458**: 561  |  **7932**: 38 
**24**: 981  |  **6459**: 478  |  **7933**: 57 
**25**: 106  |  **6460**: 1654  |  **7934**: 306 
**27**: 392  |  **6461**: 336  |  **7935**: 331 
**28**: 1100  |  **6462**: 937  |  **7936**: 7 
**29**: 35  |  **6463**: 160  |  **7937**: 35 
**30**: 64  |  **6464**: 126  |  **7938**: 112 
**31**: 37  |  **6465**: 2039  |  **7939**: 86 
**32**: 264  |  **6466**: 81  |  **7940**: 103 
**33**: 526  |  **6467**: 828  |  **7941**: 41 
**36**: 79  |  **6468**: 196  |  **7942**: 77 
**37**: 231  |  **6522**: 1199  |  **7943**: 20 
**5449**: 13  |  **6523**: 1  |  **7944**: 309 
**5450**: 616  |  **6524**: 858  |  **7945**: 34 
**5451**: 123  |  **6525**: 2436  |  **7946**: 37 
**5454**: 45  |  **6526**: 166  |  **7947**: 152 
**5995**: 166  |  **6527**: 774  |  **7948**: 59 
**5996**: 63  |  **6528**: 1396  |  **7949**: 226 
**5997**: 12  |  **7314**: 800  |  **7950**: 3 
**5998**: 109  |  **7315**: 1070  |  **85**: 984 
**5999**: 736  |  **7316**: 951  |  **88**: 212 
**6000**: 127  |  **7317**: 227  |  **89**: 188 
**6001**: 2224  |  **7318**: 86  |   
**6002**: 640  |  **7319**: 98  |   
NULL
# Bar chart: number of datapoints recorded in each year
  Nutcracker |>
    count(year) |>
    ggplot(mapping = aes(x = year, y = n)) +
    geom_bar(stat = "identity", fill = "orange") +
    ggtitle("Number of datapoints per year") +
    xlab("year") +
    ylab("counts") +
    theme_minimal()

# Bar chart: number of datapoints recorded in each calendar month
# (all years pooled)
  Nutcracker |>
    count(month) |>
    ggplot(mapping = aes(x = month, y = n)) +
    geom_bar(stat = "identity", fill = "orange") +
    ggtitle("Number of datapoints per month") +
    xlab("month") +
    ylab("counts") +
    theme_minimal()

# Grouped bar chart: number of datapoints per calendar month, with one bar
# per year inside each month.
  Nutcracker |>
    # month is stored as a two-digit string; fixing the factor levels keeps
    # the x axis in calendar order
    mutate(month = factor(month, levels = sprintf("%02d", 1:12))) |>
    count(year, month) |>
    ggplot(aes(x = month, y = n, fill = as.factor(year))) +
    geom_col(position = "dodge") +
    labs(
      # title wording aligned with the per-year and per-month plots
      title = "Number of datapoints per year and month",
      x = "month",
      y = "counts",
      fill = "Year"
    ) +
    theme_minimal()

About this chunk:

  • In total 49758 datapoints

  • Highly variable number of datapoints per individual

  • Highly variable number of datapoints per year & month

Age classes and time spans

# Age classes
# One row per bird: date of first and last datapoint, age class at capture
# and number of datapoints. Only birds whose first datapoint is from 2017
# or later are kept.
zeitspanne <- Nutcracker |>
  group_by(id) |>
  summarize(
    start = as.Date(min(datetime)),
    ende = as.Date(max(datetime)),
    alter = first(stage.at.capture),  # age class per bird
    # Datapoints per bird. The bar chart below maps y to `n`; without this
    # column ggplot would silently pick up an unrelated global variable
    # named `n` instead of the per-bird count.
    n = n()
  ) |>
  filter(year(start) >= 2017) |>
  # Harmonise the different spellings of the age class; anything else is
  # labelled "unbekannt" (unknown)
  mutate(alter = case_when(
    alter %in% c("A", "adult") ~ "adult",
    alter %in% c("J", "juvenile", "sub") ~ "juvenile",
    TRUE ~ "unbekannt"
  )) |>
  arrange(start)

# Plot: the per-bird counts are stacked within each age class, so the bar
# height is the total number of datapoints of that class
ggplot(zeitspanne, aes(x = alter, y = n, fill = alter)) +
  geom_col(width = 0.6) +
  scale_fill_manual(values = c("juvenile" = "tomato", "adult" = "steelblue", "unbekannt" = "gray")) +
  labs(
    title = "Number of datapoints from juvenile and adult birds",
    x = "Age",
    y = "Number of datapoints",
    fill = "Age class"
  ) +
  theme_minimal()

# Tracking period of every bird: one horizontal line from the first to the
# last datapoint, birds ordered by start date and coloured by age class
age_colours <- c("juvenile" = "tomato", "adult" = "steelblue", "unbekannt" = "gray")

ggplot(
  data = zeitspanne,
  mapping = aes(y = reorder(id, start), xmin = start, xmax = ende, color = alter)
) +
  geom_linerange(linewidth = 1.2) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  scale_color_manual(values = age_colours) +
  ggtitle("Time span of each bird") +
  xlab("Year") +
  ylab("Bird ID") +
  labs(color = "Age class") +
  theme_minimal()

About this chunk:

  • more datapoints of adult birds than from juveniles

  • large differences in time span between birds, some over multiple years, some over a few days

GPS-Schedules

# Keep datapoints from 2017 onwards and derive numeric hour, minute, year
# and month from the timestamp (year and month overwrite the existing
# columns of the same name)
Nutcracker_filtered <- Nutcracker |>
  filter(year >= 2017) |>
  mutate(
    hour = as.numeric(hour(datetime)),
    minute = as.numeric(minute(datetime)),
    year = as.numeric(year(datetime)),
    month = as.numeric(month(datetime))
  )

# Loop for plot creation: one figure per year showing at which time of day
# datapoints were recorded, with one panel per month.
# Years are processed in ascending order so the figures appear
# chronologically.
for (current_year in sort(unique(Nutcracker_filtered$year))) {

  # Filter data by year
  data_year <- Nutcracker_filtered |>
    filter(year == current_year)

  # create plots
  plot <- ggplot(data_year, aes(x = hour + minute / 60)) +
    # boundary = 0 aligns the one-hour bins to whole hours ([0, 1), [1, 2),
    # ...). With the default alignment the bins are centred on whole hours,
    # so the outermost bars reach beyond the 0-24 axis limits set below and
    # are dropped from the plot.
    geom_histogram(binwidth = 1, boundary = 0, fill = "skyblue",
                   color = "black", alpha = 0.7) +
    facet_wrap(~ month, ncol = 3, scales = "free_y") +  # Facet by month
    labs(title = paste("Daytime distribution of GPS-Datapoints", current_year),
         x = "Daytime (hour)", y = "number of datapoints") +
    theme_minimal() +
    scale_x_continuous(breaks = seq(0, 24, 5), limits = c(0, 24)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # show plot (explicit print() is required inside a loop)
  print(plot)
}

# Differences in GPS schedules between individuals: histogram of the time
# between consecutive datapoints for three example birds
id_group <- c("5450", "6459", "7944")

# filter by id
p <- Nutcracker |>
  filter(id %in% id_group) |>
  # order within each bird so that lag() refers to the previous datapoint
  arrange(id, datetime) |>
  group_by(id) |>
  mutate(time_diff = as.numeric(difftime(datetime, lag(datetime), units = "mins"))) |>
  # grouping is only needed for lag(); drop it before filtering and plotting
  ungroup() |>
  # remove each bird's first datapoint (no predecessor) and gaps of three
  # hours or more
  filter(!is.na(time_diff), time_diff < 180) |>
  ggplot(aes(x = time_diff)) +
  geom_histogram(binwidth = 5, fill = "tomato", color = "black") +
  facet_wrap(~ id, scales = "free_y") +
  labs(
    title = paste("Time between two datapoints for ID's:", paste(id_group, collapse = ", ")),
    x = "Time (minutes)", y = "Frequency"
  ) +
  theme_minimal()

# show Plot
print(p)

About this chunk:

  • different schedules between years and months

  • different timesteps between individuals

Conclusion:

The Nutcracker dataset consists of 49758 GPS locations collected from 115 individuals over multiple years. Each datapoint additionally holds information about the bird, its body measurements and some further metadata (34 variables in total). Due to the nature of the data and different research focuses in previous years, the number of datapoints per individual, season and year is highly variable. Additionally, due to battery restrictions, the time steps between two datapoints, as well as the schedules according to which the datapoints were collected, are also variable. These implications have to be kept in mind when working with the data.