This document outlines different data checks used to validate the 2025 butternut health survey data.

This document illustrates the need for the data checks and how they work, while the data checks themselves are implemented in a separate R file, “2025_Implement_Data_Check_Results.R”.

Import data (May & June)

library(tidyverse)
library(readr)
library(DT)

## Warning: package 'DT' was built under R version 4.5.2

library(styler)

# Raw data
# health_assess_2025 <- read_csv("data/july_23_2025_Butternut Health Assessment Form (Responses).csv")

# Data with re-named columns from google form and removed unused columns
# NOTE(review): this is an absolute path on a single machine, so the document
# only runs there as written. `health_assess_2025` is used below without being
# created in this document, so it is presumably defined by the sourced script
# -- confirm.
source("C:/Users/helmerhj/Documents/GitHub/2025-reu-temp/data_cleaning/2025_August_Data_Preparing_Columns.R")

## Warning: 1 parsing failure.
## row col expected actual
##  15  -- a number      s

## Warning: 1 parsing failure.
## row col expected actual
##  15  -- a number      d
## Warning: 1 parsing failure.
## row col expected actual
##  15  -- a number      d

# Bounds of the data-collection window (both ends inclusive).
data_collection_began <- ymd_hms("2025-06-12 00:00:00")
data_collection_end <- ymd_hms("2025-07-02 23:59:59")

# Flag the responses whose timestamp falls inside the window, then keep
# only those rows.
submitted_in_window <- between(
  health_assess_2025$timestamp,
  data_collection_began,
  data_collection_end
)

health_assess_2025 <- health_assess_2025 %>%
  filter(submitted_in_window)

Removing test entries & black walnuts

Entries used when making the form need to be removed.

# Timestamps of form submissions that are not real survey records. Each one is
# identified by the note that was typed into the form (quoted on the right).

# Test entries made while the form was being built.
# NOTE(review): these were originally labelled "May test entries", but both
# timestamps are 6 June 2025 -- confirm which is intended.
test_entry_1 <- ymd_hms("2025-06-06 15:06:31") # "EL & HH Testing"
test_entry_2 <- ymd_hms("2025-06-06 15:08:27") # "EL & HH Testing again"

# Practice entry and a misidentified black walnut recorded during the survey
test_entry_3 <- ymd_hms("2025-07-02 09:41:40") # "Fake site - practice entry"
black_walnut <- ymd_hms("2025-06-19 10:46:22") # "this is black walnut!"

test_entries <- c(black_walnut, test_entry_1, test_entry_2, test_entry_3)

# Drop every row whose submission timestamp matches one of the entries above
health_assess_2025 <- health_assess_2025 %>%
  filter(!timestamp %in% test_entries)

Site names

Override all the individuals within this timeframe to have a site_name of WCP

# Every remaining record gets the same site code, replacing whatever was
# entered in the form.
health_assess_2025 <- mutate(health_assess_2025, site_name = "WCP")

Plant numbers

Parsing the plant numbers as numbers rather than strings

# Parse the plant numbers on a scratch copy so the original strings stay
# available for the side-by-side comparison below.
test_health <- health_assess_2025 %>%
  mutate(plant_number = parse_number(plant_number))

# Original text next to the parsed number, for visual checking
plant_number_comparison <- tibble(
  original_plant_number = health_assess_2025$plant_number,
  clean_plant_number = test_health$plant_number
)

datatable(
  plant_number_comparison,
  options = list(
    pageLength = 10,
    scrollY = "400px",
    scrollX = TRUE
  ),
  class = "stripe hover row-border order-column" # forces light theme
)

# Reuse the column that was already parsed instead of parsing a second time
# (which would also repeat any parsing warnings).
health_assess_2025$plant_number <- test_health$plant_number

Canker areas

a. If ‘has_canker’ is NO then canker percentages (base/trunk/girdled) are 0.

# When no canker was recorded, force the three canker measurements to 0.
# `%in%` is used rather than `==` because it never returns NA: a row with a
# missing `has_canker` keeps its recorded measurements instead of having them
# replaced by NA.
test_health <- health_assess_2025 %>%
  mutate(
    across(
      c(base_canker_area, trunk_canker_area, girdled_canker_circum),
      ~ if_else(has_canker %in% "No", 0, .x)
    )
  )

# Original and cleaned values side by side for each canker column
canker_comparison <- tibble(
  has_canker = health_assess_2025$has_canker,
  original_trunk = health_assess_2025$trunk_canker_area,
  clean_trunk = test_health$trunk_canker_area,
  original_base = health_assess_2025$base_canker_area,
  clean_base = test_health$base_canker_area,
  original_girdle = health_assess_2025$girdled_canker_circum,
  clean_girdle = test_health$girdled_canker_circum
)

# Show the "No" rows first, since those are the rows this check changes
canker_comparison_sorted <- canker_comparison %>%
  arrange(factor(has_canker, levels = c("No", "Yes")))

datatable(
  canker_comparison_sorted,
  options = list(
    pageLength = 10,
    scrollY = "400px",
    scrollX = TRUE
  ),
  class = "stripe hover row-border order-column" # forces light theme
)

# Copy the cleaned columns back into the working dataset
health_assess_2025$base_canker_area <- test_health$base_canker_area
health_assess_2025$trunk_canker_area <- test_health$trunk_canker_area
health_assess_2025$girdled_canker_circum <- test_health$girdled_canker_circum

Densiometer

a. Set to numeric

For a majority of entries that just means removing additional text like “open” etc., but there are some specific entries that need to be manually edited.

# Pull the first number out of each densiometer entry on a scratch copy.
# Entries with no number at all become NA (with a parsing warning).
densio_cleaning <- health_assess_2025 %>%
  mutate(
    across(
      c(densio_north, densio_south, densio_east, densio_west),
      parse_number
    )
  )

## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `densio_north = parse_number(densio_north)`.
## Caused by warning:
## ! 1 parsing failure.
## row col expected                                                                                                                                                                                                                                 actual
##  22  -- a number Not relevant because this inidivual's highest point is below the line of forb vegetation cover which is not consistent throughout the season. Therefore, the densiomwter read right now is not representative. Also, the tree is dead.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

# Raw form text next to the parsed number for each compass direction, so
# entries that still need manual attention are easy to spot.
densio_comparison <- tibble(
  individual = densio_cleaning[["plant_number"]],
  original_north = health_assess_2025[["densio_north"]],
  clean_north = densio_cleaning[["densio_north"]],
  original_south = health_assess_2025[["densio_south"]],
  clean_south = densio_cleaning[["densio_south"]],
  original_east = health_assess_2025[["densio_east"]],
  clean_east = densio_cleaning[["densio_east"]],
  original_west = health_assess_2025[["densio_west"]],
  clean_west = densio_cleaning[["densio_west"]]
)

densio_comparison %>%
  datatable(
    options = list(pageLength = 10, scrollY = "400px", scrollX = TRUE),
    class = "stripe hover row-border order-column" # forces light theme
  )

b. Manually resetting values for unique entries

The above table indicates the following manual edits are necessary:

# Plants 6, 17, 38, 43, 44, 47, 71, 72

# Overwrite the four densiometer readings of one plant.
#
# data:  data frame with `plant_number` and the four `densio_*` columns.
# plant: plant number whose row(s) should be overwritten.
# north, east, south, west: replacement readings (use NA_real_ for "no value").
#
# Returns `data` with the readings replaced on every row matching `plant`.
# Warns when no row matches, so a mistyped plant number is not silently
# ignored.
set_densio_readings <- function(data, plant, north, east, south, west) {
  rows <- which(data$plant_number == plant)
  if (length(rows) == 0) {
    warning(
      "No entry found for plant ", plant, "; densiometer readings not changed.",
      call. = FALSE
    )
  }
  data$densio_north[rows] <- north
  data$densio_east[rows] <- east
  data$densio_south[rows] <- south
  data$densio_west[rows] <- west
  data
}

# Plant 6
#   Read: "Same as SH5"
#   SH5's densio readings: 48 56 94 96
densio_cleaning <- set_densio_readings(
  densio_cleaning,
  plant = 6, north = 48, east = 56, south = 94, west = 96
)

# Plant 17
#   Read: "W - 33. E - 12. N - 68. S - 2. 30% open, 70% canopy cover"
densio_cleaning <- set_densio_readings(
  densio_cleaning,
  plant = 17, north = 68, east = 12, south = 2, west = 33
)

# Plant 38
#   Read: "Not relevant because this individual's highest point is below the
#   line of forb vegetation cover which is not consistent throughout the
#   season. Therefore, the densiometer read right now is not representative.
#   Also, the tree is dead."
densio_cleaning <- set_densio_readings(
  densio_cleaning,
  plant = 38,
  north = NA_real_, east = NA_real_, south = NA_real_, west = NA_real_
)

# Plant 43
#   Read: "E - 7. S - 6. W - 1. N - 5. 19.76% open, 80.24% canopy cover"
densio_cleaning <- set_densio_readings(
  densio_cleaning,
  plant = 43, north = 5, east = 7, south = 6, west = 1
)

# Plant 44
#   Read: "E - 8. S - 5. W - 10. N - 95 30.7% filled, 69.3% canopy cover"
densio_cleaning <- set_densio_readings(
  densio_cleaning,
  plant = 44, north = 95, east = 8, south = 5, west = 10
)

# Plant 47
#   Read: "E - 7. N - 4. S - 12. W - 3. 27% open, 73% canopy cover."
densio_cleaning <- set_densio_readings(
  densio_cleaning,
  plant = 47, north = 4, east = 7, south = 12, west = 3
)

# Plant 71
#   Read: "W - 32. E - 76. S - 10. N - 32. 39% empty or 61% canopy cover."
densio_cleaning <- set_densio_readings(
  densio_cleaning,
  plant = 71, north = 32, east = 76, south = 10, west = 32
)

# Plant 72
#   Read: "S - 1. E - 56. N - 53. W - 54. 43% open or 57% canopy cover."
densio_cleaning <- set_densio_readings(
  densio_cleaning,
  plant = 72, north = 53, east = 56, south = 1, west = 54
)

# Rebuild the comparison now that the manual overrides are in place: raw form
# text next to the final numeric reading for each compass direction.
densio_comparison <- tibble(
  individual = densio_cleaning[["plant_number"]],
  original_north = health_assess_2025[["densio_north"]],
  clean_north = densio_cleaning[["densio_north"]],
  original_south = health_assess_2025[["densio_south"]],
  clean_south = densio_cleaning[["densio_south"]],
  original_east = health_assess_2025[["densio_east"]],
  clean_east = densio_cleaning[["densio_east"]],
  original_west = health_assess_2025[["densio_west"]],
  clean_west = densio_cleaning[["densio_west"]]
)

densio_comparison %>%
  datatable(
    options = list(pageLength = 10, scrollY = "400px", scrollX = TRUE),
    class = "stripe hover row-border order-column" # forces light theme
  )

# Replace the raw densiometer text in the working dataset with the cleaned
# numeric readings.
health_assess_2025[["densio_north"]] <- densio_cleaning[["densio_north"]]
health_assess_2025[["densio_east"]] <- densio_cleaning[["densio_east"]]
health_assess_2025[["densio_south"]] <- densio_cleaning[["densio_south"]]
health_assess_2025[["densio_west"]] <- densio_cleaning[["densio_west"]]

c. Calculating the densio_avg and canopy density

Create these aggregate columns from cleaned densiometer data.

# Per-individual aggregates of the four cleaned densiometer readings:
#   densio_average = mean of the available readings * 1.04
#   canopy_density = 100 - densio_average
# NOTE(review): 1.04 is presumably the standard densiometer factor that turns
# the mean count into a percentage -- confirm against the field protocol.
health_assess_2025 <- health_assess_2025 %>%
  mutate(
    # cbind() builds the readings matrix from the columns directly, so this
    # does not depend on the magrittr `.` placeholder.
    densio_average = rowMeans(
      cbind(densio_west, densio_east, densio_north, densio_south),
      na.rm = TRUE
    ) * 1.04,
    # With na.rm = TRUE, rowMeans() returns NaN for an individual with no
    # readings at all; record that as a proper missing value instead.
    densio_average = if_else(is.nan(densio_average), NA_real_, densio_average),
    canopy_density = 100 - densio_average
  )

Height, DBH and Is_Seedling Class

a. Converting all the height texts into feet measurements

Many of the height entries were first inputted as “1 foot 3 inches”, explicitly writing out the components. Instead, we want just a pure feet measurement in that column. So, I will extract the feet/foot numbers and inches and then convert those accordingly.

# Convert the free-text height entries into one numeric value in feet.
#   * No unit text in the entry -> the number is assumed to already be in feet.
#   * Unit text in the entry    -> feet and inches are extracted separately and
#                                  combined as feet + inches / 12.
test_height_cleaning <- health_assess_2025 %>%
  select(plant_height_ft) %>%
  mutate(
    # Normalise the text: trim surrounding whitespace and lower-case it
    height_str = str_to_lower(str_trim(plant_height_ft)),

    # "\\d+\\.?\\d*" reads as: digits, an optional decimal point, then any
    # further digits. The feet value is such a number followed by a feet unit.
    # "foot" and "feet" are accepted as well as "ft", so an entry such as
    # "1 foot 3 inches" keeps its feet component.
    feet_str = str_extract(height_str, "\\d+\\.?\\d*\\s*(ft|foot|feet)"),
    feet = as.numeric(str_extract(feet_str, "\\d+\\.?\\d*")),

    # The inches value is a number followed by "inches" or "in",
    # e.g., "7 inches" or "7in"
    inches_str = str_extract(height_str, "\\d+\\.?\\d*\\s*(inches|in)"),
    inches = as.numeric(str_extract(inches_str, "\\d+\\.?\\d*")),

    # Combine the two parts, treating a missing part as 0. If neither part
    # could be extracted the result is NA rather than a made-up height of 0.
    calculated_from_text_feet = if_else(
      is.na(feet) & is.na(inches),
      NA_real_,
      coalesce(feet, 0) + coalesce(inches, 0) / 12.0
    ),

    # Does the entry mention a unit? ("in" also covers "inches")
    contains_text = str_detect(height_str, "ft|foot|feet|in"),

    # Entries without unit text are taken as a plain number of feet.
    # as.numeric() is evaluated for every row, so a coercion warning is
    # expected whenever some entries contain unit text.
    plant_height_ft_cleaned = if_else(
      !contains_text,
      as.numeric(height_str),
      calculated_from_text_feet
    )
  ) %>%
  select(plant_height_ft, plant_height_ft_cleaned)

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `plant_height_ft_cleaned = if_else(!contains_text,
##   as.numeric(height_str), calculated_from_text_feet)`.
## Caused by warning in `if_else()`:
## ! NAs introduced by coercion

# Original height text next to the cleaned value in feet
height_comparison <- tibble(
  original_height_ft = health_assess_2025[["plant_height_ft"]],
  clean_height_ft = test_height_cleaning[["plant_height_ft_cleaned"]]
)

height_comparison %>%
  datatable(
    options = list(pageLength = 10, scrollY = "400px", scrollX = TRUE),
    class = "stripe hover row-border order-column" # forces light theme
  )

# Replace the height text in the working dataset with the cleaned feet values
health_assess_2025 <- mutate(
  health_assess_2025,
  plant_height_ft = test_height_cleaning[["plant_height_ft_cleaned"]]
)

b. Verifying no densiometer readings for tall trees

For the first case of translating height entries from text to numbers there were explicit units like “1 foot 3 inches.” However, other individuals lacked explicit height units and were listed as “67”, for example.

In our case, it would have been easy to accidentally input inches because we were using a ruler for the seedlings. So, I went through and verified that the unitless entries were in fact in feet and had not been entered in inches by accident. To start, we’ll consider all the trees, ordered from tallest to shortest, and compare each height against its densiometer reading.

# Subset of the columns relevant to the height check, tallest individuals
# first (missing heights sort to the end).
view_height <- health_assess_2025 %>%
  select(
    plant_number, dbh_cm, plant_height_ft, densio_north, additional_notes
  ) %>%
  arrange(desc(plant_height_ft))

datatable(
  view_height,
  options = list(
    pageLength = 10,
    scrollY = "400px",
    scrollX = TRUE
  ),
  class = "stripe hover row-border order-column" # forces light theme
)

When we do this, only one tree stands out of place.

SH29 is labeled to have a height of 62 feet, yet also has a densiometer reading of 66

Let’s look into it specifically.

SH29

On closer inspection, SH29 had a written height of “62” with no units. With my current method of extracting heights, this individual was labeled as a 62 FEET tall individual. However, this individual does NOT have a DBH, indicating that this height may actually be in inches, not feet.

Observing the photos verifies this discrepancy and explains why there is a densiometer reading, as well.

Thus, I manually override SH29’s height, converting the recorded 62 inches to feet:

# SH29's "62" was recorded in inches; store it as feet (62 / 12) on every row
# belonging to plant 29.
health_assess_2025 <- health_assess_2025 %>%
  mutate(
    plant_height_ft = replace(
      plant_height_ft,
      which(plant_number == 29),
      62.0 / 12
    )
  )

c. Ensure seedlings have densiometer readings

Slight background: Originally we were not very consistent with the definition of adult versus seedling. We began using the rule that if a tree is >4 feet tall then it should have a DBH and be identified as an adult.

However, in the end, it was ultimately whether we could take a densiometer reading that differentiated adults and seedlings. As such, the next check is seeing which individuals have densiometer readings and a seedling classification; this, by extension, relates to height. We’d expect individuals with a densiometer reading to also be labeled as seedlings in seedling_y_n.

# Just the columns needed to compare densiometer presence with the seedling flag
test <- select(health_assess_2025, plant_number, densio_north, seedling_y_n)

test %>%
  datatable(
    options = list(pageLength = 10, scrollY = "400px", scrollX = TRUE),
    class = "stripe hover row-border order-column" # forces light theme
  )

Exploring this, we find no individuals labeled as adults with densiometer readings, great!

However, we do find individuals labeled as seedlings with no densiometer:

SH30
- Turns out this tree was totally cut and thus we didn’t take densiometer data on it. As such, it seems reasonable to remove this individual from analysis. I haven’t done this, but I wanted to note the oddity for future purpose.
SH38
- This seedling is dead! Additional notes read: “DEAD seedling - no sign of canker. Appears to have been death by shade”; as such it needs to be carefully considered. After finding this, I added the below parse dead individuals section.

Parse dead individuals

After finding SH38 to be dead in previous check, I manually F-searched in the sheet for additional notes noting ‘Dead’. If we want to combine this dataset with the updated form, which includes specific metrics for dead individuals, we can manually parse these individuals.

For now I simply remove them to parse in the future.

# Individuals whose additional notes mark them as dead:

# SH38: "DEAD seedling - no   sign of canker. Appears to have been death by shade"

# SH2: "DEAD tree. Canopy class is based on structure, not live canopy. Callous is also based on whether cankers look like they have ever been calloused."

# SH50: "Stem is dead. Need to add a category for "seedling died" Unsure if 0 canker or 100% canker. Could have been killed by canker or the fungus is from after it died."

dead_individuals <- c(2, 38, 50)

# Drop every row belonging to one of the dead individuals
health_assess_2025 <- health_assess_2025 %>%
  filter(!plant_number %in% dead_individuals)

Purdue canker severity ratings (Canker & Canopy)

Removes additional text description

# Replace each full Purdue canker-severity description with its leading
# rating ("1" to "5"). Values that match none of the descriptions are left
# unchanged.
health_assess_2025 <- health_assess_2025 %>%
  mutate(
    purdue_severity_canker = recode(
      purdue_severity_canker,
      !!!c(
        "1. Fewer than 3 active cankers that are all smaller than 2-3 inches in length or diameter OR fewer than 3 inactive cankers." = "1",
        "2. More than 3 active cankers, OR 2-5 shallow (with no dead tissue) healed over with cracks less than 7 inches long." = "2",
        "3. More than 5 active OR inactive cankers cracked through the bark to the tissue below which have healed over, but you still see the level of damage." = "3",
        "4. Cankers occur all over the 10-foot area, with deep cracks and both active and inactive cankers." = "4",
        "5. Tree almost dead, mostly inactive cankers with deep cracks to dead tissue." = "5"
      )
    )
  )

Parse epicormics as Y/N

In the later versions of the survey, the epicormic question changed from a count of how many epicormics were present to a simple Y or N. Here, I make that adjustment.

# Collapse the epicormic counts to a flag on a scratch copy:
# "N" when the value is missing or 0, "Y" otherwise.
epicormic_cleaning <- health_assess_2025 %>%
  mutate(
    across(
      c(base_epicormics, trunk_epicormics),
      ~ if_else(is.na(.x) | .x == 0, "N", "Y")
    )
  )

# Original counts next to the derived Y/N flags
epicormic_comparison <- tibble(
  individual = health_assess_2025[["plant_number"]],
  original_base_ep = health_assess_2025[["base_epicormics"]],
  clean_base_ep = epicormic_cleaning[["base_epicormics"]],
  original_trunk_ep = health_assess_2025[["trunk_epicormics"]],
  clean_trunk_ep = epicormic_cleaning[["trunk_epicormics"]]
)

epicormic_comparison %>%
  datatable(
    options = list(pageLength = 10, scrollY = "400px", scrollX = TRUE),
    class = "stripe hover row-border order-column" # forces light theme
  )

# Write both flags back into the working dataset
health_assess_2025 <- health_assess_2025 %>%
  mutate(
    base_epicormics = epicormic_cleaning[["base_epicormics"]],
    trunk_epicormics = epicormic_cleaning[["trunk_epicormics"]]
  )

Seedlings don’t have callouses

Later in form we decided to stop trying to assess callouses on seedlings, as such their callous information needs to be updated

# Seedlings only, for the before/after comparison
seedlings_assess_2025 <- health_assess_2025 %>% filter(seedling_y_n == "Yes")

# Blank out the callous answer for seedlings.
# `%in%` is used rather than `==` because it never returns NA, so a row whose
# seedling status is missing keeps its recorded callous answer.
seedling_clean <- seedlings_assess_2025 %>%
  mutate(
    has_callous = if_else(seedling_y_n %in% "Yes", NA, has_callous)
  )

seedling_comparison <- tibble(
  individual = seedlings_assess_2025$plant_number,
  original_callous = seedlings_assess_2025$has_callous,
  clean_callous = seedling_clean$has_callous
)

datatable(
  seedling_comparison,
  options = list(
    pageLength = 10,
    scrollY = "400px",
    scrollX = TRUE
  ),
  class = "stripe hover row-border order-column" # forces light theme
)

# Apply the same rule to the full working dataset; only rows explicitly marked
# as seedlings are changed.
health_assess_2025 <- health_assess_2025 %>%
  mutate(
    has_callous = if_else(seedling_y_n %in% "Yes", NA, has_callous)
  )

Individual-by-individual revisions

Space to make case-by-case adjustments based on the individual-focused photo reviews.

# SH37
# - this tree had 3 separate entries describing the same individual, so they can be aggregated into one entry manually

Completely checked

# Show the fully checked dataset
health_assess_2025 %>%
  datatable(
    options = list(pageLength = 10, scrollY = "400px", scrollX = TRUE),
    class = "stripe hover row-border order-column" # forces light theme
  )

# Save the cleaned data as CSV (relative to the working directory), without
# the row-name column.
write.csv(
  x = health_assess_2025,
  file = "may_june_cleaned_2025.csv",
  row.names = FALSE
)

2025_data_checks

2026-01-31