This document outlines different data checks used to validate the 2025 butternut health survey data.
This document illustrates the need for the data checks and how they work, while the data checks themselves are implemented in a separate R file “2025_Implement_Data_Check_Results.R”.
library(tidyverse)
library(readr)
library(DT)
## Warning: package 'DT' was built under R version 4.5.2
library(styler)
# Raw data
# health_assess_2025 <- read_csv("data/july_23_2025_Butternut Health Assessment Form (Responses).csv")
# Data with re-named columns from google form and removed unused columns
# The sourced script presumably creates `health_assess_2025`, which every step
# below reads and modifies.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where that directory exists — consider a project-relative path.
source("C:/Users/helmerhj/Documents/GitHub/2025-reu-temp/data_cleaning/2025_August_Data_Preparing_Columns.R")
## Warning: 1 parsing failure.
## row col expected actual
## 15 -- a number s
## Warning: 1 parsing failure.
## row col expected actual
## 15 -- a number d
## Warning: 1 parsing failure.
## row col expected actual
## 15 -- a number d
# Keep only submissions made inside the collection window (both ends
# inclusive); rows timestamped outside it are dropped.
data_collection_began <- ymd_hms("2025-06-12 00:00:00")
data_collection_end <- ymd_hms("2025-07-02 23:59:59")
health_assess_2025 <- health_assess_2025 %>%
  filter(
    timestamp >= data_collection_began,
    timestamp <= data_collection_end
  )
Entries used when making the form need to be removed.
# Timestamps of form submissions that are not real survey records.
# Practice entries made while the form was being built. Both predate
# `data_collection_began`, so the date-window filter has already dropped them;
# they stay listed here so the exclusion is explicit.
test_entry_1 <- ymd_hms("2025-06-06 15:06:31") # "EL & HH Testing"
test_entry_2 <- ymd_hms("2025-06-06 15:08:27") # "EL & HH Testing again"
# Practice entry submitted inside the collection window.
test_entry_3 <- ymd_hms("2025-07-02 09:41:40") # "Fake site - practice entry"
# Entry whose note identifies the tree as a black walnut, not a butternut.
black_walnut <- ymd_hms("2025-06-19 10:46:22") # "this is black walnut!"
test_entries <- c(black_walnut, test_entry_1, test_entry_2, test_entry_3)
# Remove test entries & black walnut entry
health_assess_2025 <- health_assess_2025 %>%
  filter(!timestamp %in% test_entries)
Override all the individuals within this timeframe to have a site_name of WCP
# Every record still present at this point receives the same site label.
health_assess_2025 <- mutate(health_assess_2025, site_name = "WCP")
Parsing the plant numbers as numbers rather than strings
# Parse on a scratch copy first so the raw and parsed plant numbers can be
# displayed side by side before the parsed values replace the originals.
test_health <- health_assess_2025
test_health$plant_number <- parse_number(test_health$plant_number)
plant_number_comparison <- tibble(
  original_plant_number = health_assess_2025[["plant_number"]],
  clean_plant_number = test_health[["plant_number"]]
)
plant_number_comparison %>%
  datatable(
    class = "stripe hover row-border order-column", # forces light theme
    options = list(
      pageLength = 10,
      scrollY = "400px",
      scrollX = TRUE
    )
  )
# Commit the parsed values (same parse of the same input) to the working data.
health_assess_2025$plant_number <- test_health$plant_number
# Trees recorded as having no canker get a canker extent of 0 for all three
# canker measurements. The change is made on a scratch copy, shown next to the
# original values, and only then written back.
#
# `%in%` is used instead of `==` so that a missing `has_canker` answer leaves
# the recorded measurement untouched; with `==` the condition would be NA and
# if_else() would overwrite the measurement with NA.
test_health <- health_assess_2025 %>%
  mutate(
    base_canker_area = if_else(has_canker %in% "No", 0, base_canker_area),
    trunk_canker_area = if_else(has_canker %in% "No", 0, trunk_canker_area),
    girdled_canker_circum = if_else(
      has_canker %in% "No", 0, girdled_canker_circum
    )
  )
canker_comparison <- tibble(
  has_canker = health_assess_2025$has_canker,
  original_trunk = health_assess_2025$trunk_canker_area,
  clean_trunk = test_health$trunk_canker_area,
  original_base = health_assess_2025$base_canker_area,
  clean_base = test_health$base_canker_area,
  original_girdle = health_assess_2025$girdled_canker_circum,
  clean_girdle = test_health$girdled_canker_circum
)
canker_comparison_sorted <- canker_comparison %>%
  arrange(factor(has_canker, levels = c("No", "Yes"))) # "No" first, then "Yes"
datatable(
  canker_comparison_sorted,
  options = list(
    pageLength = 10,
    scrollY = "400px",
    scrollX = TRUE
  ),
  class = "stripe hover row-border order-column" # forces light theme
)
# Write the cleaned canker columns back to the working data set.
health_assess_2025$base_canker_area <- test_health$base_canker_area
health_assess_2025$trunk_canker_area <- test_health$trunk_canker_area
health_assess_2025$girdled_canker_circum <- test_health$girdled_canker_circum
For the majority of entries, cleaning the densiometer readings just means removing additional text like “open” etc., but there are some specific entries that need to be manually edited.
# Pull the first number out of each free-text densiometer reading. The result
# is kept in a separate object so it can be compared with the raw text.
densio_cleaning <- health_assess_2025 %>%
  mutate(
    across(
      c(densio_north, densio_south, densio_east, densio_west),
      parse_number
    )
  )
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `densio_north = parse_number(densio_north)`.
## Caused by warning:
## ! 1 parsing failure.
## row col expected actual
## 22 -- a number Not relevant because this inidivual's highest point is below the line of forb vegetation cover which is not consistent throughout the season. Therefore, the densiomwter read right now is not representative. Also, the tree is dead.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Raw text next to the parsed number for each direction, one row per entry,
# so entries that parse_number() got wrong can be spotted by eye.
densio_comparison <- tibble(
  individual = densio_cleaning[["plant_number"]],
  original_north = health_assess_2025[["densio_north"]],
  clean_north = densio_cleaning[["densio_north"]],
  original_south = health_assess_2025[["densio_south"]],
  clean_south = densio_cleaning[["densio_south"]],
  original_east = health_assess_2025[["densio_east"]],
  clean_east = densio_cleaning[["densio_east"]],
  original_west = health_assess_2025[["densio_west"]],
  clean_west = densio_cleaning[["densio_west"]]
)
densio_comparison %>%
  datatable(
    class = "stripe hover row-border order-column", # forces light theme
    options = list(
      pageLength = 10,
      scrollY = "400px",
      scrollX = TRUE
    )
  )
The above table indicates the following manual edits are necessary:
# Manual corrections applied on top of the parse_number() results.
# Each stanza finds the row(s) for one plant number and overwrites that plant's
# four directional readings in place, using the values transcribed from the
# quoted free-text entry ("Read: ...").
# Plants 6, 17, 38, 43, 44, 47, 71, 72
# Plant 6
# Read: "Same as SH5"
# SH5's densio readings: 48 56 94 96
# NOTE(review): the four values are assigned below as north, east, south, west
# in that order — confirm this matches the direction order of SH5's readings.
row_index <- which(densio_cleaning$plant_number == 6)
densio_cleaning$densio_north[row_index] <- 48
densio_cleaning$densio_east[row_index] <- 56
densio_cleaning$densio_south[row_index] <- 94
densio_cleaning$densio_west[row_index] <- 96
# Plant 17
# Read: "W - 33. E - 12. N - 68. S - 2. 30% open, 70% canopy cover"
row_index <- which(densio_cleaning$plant_number == 17)
densio_cleaning$densio_north[row_index] <- 68
densio_cleaning$densio_east[row_index] <- 12
densio_cleaning$densio_south[row_index] <- 2
densio_cleaning$densio_west[row_index] <- 33
# Plant 38
# Read: "Not relevant because this individual's highest point is below the line of forb vegetation cover which is not consistent throughout the season. Therefore, the densiometer read right now is not representative. Also, the tree is dead."
# No usable reading was recorded, so all four directions are set to missing.
row_index <- which(densio_cleaning$plant_number == 38)
densio_cleaning$densio_north[row_index] <- NA
densio_cleaning$densio_east[row_index] <- NA
densio_cleaning$densio_south[row_index] <- NA
densio_cleaning$densio_west[row_index] <- NA
# Plant 43
# Read: "E - 7. S - 6. W - 1. N - 5. 19.76% open, 80.24% canopy cover"
# NOTE(review): 19.76 = (7 + 6 + 1 + 5) * 1.04, i.e. the quoted percentage uses
# the SUM of the four readings, whereas the densio_average computed later in
# this file uses their mean — confirm these readings are on the same scale as
# the other plants'.
row_index <- which(densio_cleaning$plant_number == 43)
densio_cleaning$densio_north[row_index] <- 5
densio_cleaning$densio_east[row_index] <- 7
densio_cleaning$densio_south[row_index] <- 6
densio_cleaning$densio_west[row_index] <- 1
# Plant 44
# Read: "E - 8. S - 5. W - 10. N - 95 30.7% filled, 69.3% canopy cover"
# N is read as 95 (followed by "30.7% filled"): mean(8, 5, 10, 95) * 1.04 = 30.68.
row_index <- which(densio_cleaning$plant_number == 44)
densio_cleaning$densio_north[row_index] <- 95
densio_cleaning$densio_east[row_index] <- 8
densio_cleaning$densio_south[row_index] <- 5
densio_cleaning$densio_west[row_index] <- 10
# Plant 47
# Read: "E - 7. N - 4. S - 12. W - 3. 27% open, 73% canopy cover."
# NOTE(review): as with plant 43, 27 ~ (7 + 4 + 12 + 3) * 1.04 uses the sum of
# the readings rather than their mean — confirm the scale.
row_index <- which(densio_cleaning$plant_number == 47)
densio_cleaning$densio_north[row_index] <- 4
densio_cleaning$densio_east[row_index] <- 7
densio_cleaning$densio_south[row_index] <- 12
densio_cleaning$densio_west[row_index] <- 3
# Plant 71
# Read: "W - 32. E - 76. S - 10. N - 32. 39% empty or 61% canopy cover."
row_index <- which(densio_cleaning$plant_number == 71)
densio_cleaning$densio_north[row_index] <- 32
densio_cleaning$densio_east[row_index] <- 76
densio_cleaning$densio_south[row_index] <- 10
densio_cleaning$densio_west[row_index] <- 32
# Plant 72
# Read: "S - 1. E - 56. N - 53. W - 54. 43% open or 57% canopy cover."
row_index <- which(densio_cleaning$plant_number == 72)
densio_cleaning$densio_north[row_index] <- 53
densio_cleaning$densio_east[row_index] <- 56
densio_cleaning$densio_south[row_index] <- 1
densio_cleaning$densio_west[row_index] <- 54
# Rebuild the raw-vs-clean table now that the manual corrections are in, for a
# final visual check before the cleaned values are committed.
densio_comparison <- tibble(
  individual = densio_cleaning[["plant_number"]],
  original_north = health_assess_2025[["densio_north"]],
  clean_north = densio_cleaning[["densio_north"]],
  original_south = health_assess_2025[["densio_south"]],
  clean_south = densio_cleaning[["densio_south"]],
  original_east = health_assess_2025[["densio_east"]],
  clean_east = densio_cleaning[["densio_east"]],
  original_west = health_assess_2025[["densio_west"]],
  clean_west = densio_cleaning[["densio_west"]]
)
densio_comparison %>%
  datatable(
    class = "stripe hover row-border order-column", # forces light theme
    options = list(
      pageLength = 10,
      scrollY = "400px",
      scrollX = TRUE
    )
  )
# Overwrite original densio columns with cleaned versions
health_assess_2025 <- health_assess_2025 %>%
  mutate(
    densio_north = densio_cleaning$densio_north,
    densio_east = densio_cleaning$densio_east,
    densio_south = densio_cleaning$densio_south,
    densio_west = densio_cleaning$densio_west
  )
Create these aggregate columns from cleaned densiometer data.
# Aggregate the four cleaned densiometer readings per individual:
# * densio_average: mean of the available directional readings, times 1.04
# * canopy_density: 100 - densio_average
health_assess_2025 <- health_assess_2025 %>%
  mutate(
    densio_average = rowMeans(
      select(., densio_west, densio_east, densio_north, densio_south),
      na.rm = TRUE
    ) * 1.04,
    # With na.rm = TRUE, rowMeans() returns NaN for a row that has no readings
    # at all; record that as NA so both aggregates are plain missing values
    # (and are exported as NA rather than NaN).
    densio_average = if_else(is.nan(densio_average), NA_real_, densio_average),
    canopy_density = 100 - densio_average
  )
Many of the height entries were first inputted as “1 foot 3 inches”, explicitly writing out the components. Instead, we want just a pure feet measurement in that column. So, I will extract the feet/foot numbers and inches and then convert those accordingly.
# Convert free-text height entries into a single numeric value in feet.
# Big picture:
# * An entry with no unit text is assumed to already be in feet.
# * An entry with unit text has its feet and inches parts extracted and is
#   re-calculated as feet + inches / 12.
# Result: a two-column tibble holding the raw entry (plant_height_ft) and the
# cleaned value (plant_height_ft_cleaned), in the same row order as the input.
test_height_cleaning <- health_assess_2025 %>%
  select(plant_height_ft) %>%
  mutate(
    # Normalise the text for matching (trim surrounding spaces, lower-case)
    height_str = str_to_lower(str_trim(plant_height_ft)),
    # The number pattern "\\d+\\.?\\d*" reads as "digits, maybe a dot, maybe
    # more digits": "\\d+" takes the leading whole digits, "\\.?" an optional
    # literal decimal point, and "\\d*" any digits after it.
    # Feet: the number directly followed by a feet unit. "feet" and "foot" are
    # accepted as well as "ft", so an entry such as "1 foot 3 inches" keeps
    # its feet component instead of being reduced to the inches alone.
    feet_str = str_extract(height_str, "\\d+\\.?\\d*\\s*(ft|feet|foot)"),
    feet = as.numeric(str_extract(feet_str, "\\d+\\.?\\d*")),
    # Inches: the number directly followed by "inches" or "in",
    # e.g., "7 inches" or "7in"
    inches_str = str_extract(height_str, "\\d+\\.?\\d*\\s*(inches|in)"),
    inches = as.numeric(str_extract(inches_str, "\\d+\\.?\\d*")),
    # coalesce() substitutes 0 for a component that was not present
    calculated_from_text_feet = coalesce(feet, 0) + coalesce(inches, 0) / 12.0,
    # Does the entry carry any unit text? ("in" also covers "inch"/"inches")
    contains_text = str_detect(height_str, "ft|feet|foot|in"),
    # Unit-less entries are taken as a number of feet. as.numeric() is
    # evaluated for every row, so entries that do carry units trigger an
    # expected "NAs introduced by coercion" warning here.
    plant_height_ft_cleaned = if_else(
      !contains_text,
      as.numeric(height_str),
      calculated_from_text_feet
    )
  ) %>%
  select(plant_height_ft, plant_height_ft_cleaned)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `plant_height_ft_cleaned = if_else(!contains_text,
## as.numeric(height_str), calculated_from_text_feet)`.
## Caused by warning in `if_else()`:
## ! NAs introduced by coercion
# Show each raw height entry beside its cleaned value in feet, then replace
# the raw column with the cleaned one.
height_comparison <- tibble(
  original_height_ft = health_assess_2025[["plant_height_ft"]],
  clean_height_ft = test_height_cleaning[["plant_height_ft_cleaned"]]
)
height_comparison %>%
  datatable(
    class = "stripe hover row-border order-column", # forces light theme
    options = list(
      pageLength = 10,
      scrollY = "400px",
      scrollX = TRUE
    )
  )
health_assess_2025$plant_height_ft <- test_height_cleaning$plant_height_ft_cleaned
For the first case of translating height entries from text to numbers there were explicit units like “1 foot 3 inches.” However, other individuals lacked explicit height units and were listed as “67”, for example.
In our case, it would be easy to have accidentally inputted inches because we were using a ruler for the seedlings. So, I went through and verified that the unitless entries were in fact in feet and had not been entered in inches by accident. To start, we’ll consider all the trees and order them by height (tallest first), with the DBH and densiometer reading shown alongside.
# Viewing subset of health_assess_2025 data: the columns needed to judge
# whether a recorded height is plausible for that individual.
view_height <- health_assess_2025 %>%
  select(plant_number, dbh_cm, plant_height_ft, densio_north, additional_notes)
# Ordering with highest height at the top.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
view_height <- view_height[order(view_height$plant_height_ft, decreasing = TRUE), ]
datatable(
  view_height,
  options = list(
    pageLength = 10,
    scrollY = "400px",
    scrollX = TRUE
  ),
  class = "stripe hover row-border order-column" # forces light theme
)
When we do this, only one tree stands out of place.
Let’s look into it specifically.
On closer inspection, SH29 had a written height of “62” with no units. With my current method of extracting heights, this individual was labeled as a 62 FEET tall individual. However, this individual does NOT have a DBH, indicating that this height may actually be in inches, not feet.
Observing the photos verifies this discrepancy and explains why there is a densiometer reading, as well.
Thus, I manually override SH29’s height to convert to inches:
# SH29's unitless "62" is treated as inches, so it is stored as 62 / 12 feet.
row_index <- which(health_assess_2025$plant_number == 29)
health_assess_2025$plant_height_ft[row_index] <- 62 / 12
Slight background: Originally we were not very consistent with the definition of adult versus seedling. We began using the rule that if a tree is >4 feet tall then it should have a DBH and be identified as an adult.
However, in the end, it was ultimately whether we could take a densiometer reading which differentiated adults and seedlings. As such, the next check is seeing which individuals have densiometer readings and a seedling classification; this by extension relates to height. We’d expect individuals with a densiometer reading to also be labeled as seedlings in seedling_y_n.
# Densiometer reading beside the seedling flag for every individual, so that
# mismatches between the two can be found by eye.
test <- select(health_assess_2025, plant_number, densio_north, seedling_y_n)
test %>%
  datatable(
    class = "stripe hover row-border order-column", # forces light theme
    options = list(
      pageLength = 10,
      scrollY = "400px",
      scrollX = TRUE
    )
  )
Exploring this, we find no individuals labeled as adults with densiometer readings, great!
However, we do find individuals labeled as seedlings with no densiometer:
SH30
SH38
After finding SH38 to be dead in previous check, I manually F-searched in the sheet for additional notes noting ‘Dead’. If we want to combine this dataset with the updated form, which includes specific metrics for dead individuals, we can manually parse these individuals.
For now I simply remove them to parse in the future.
# Individuals whose notes record them as dead are dropped for now.
# SH38: "DEAD seedling - no sign of canker. Appears to have been death by shade"
# SH2: "DEAD tree. Canopy class is based on structure, not live canopy. Callous is also based on whether cankers look like they have ever been calloused."
# SH50: "Stem is dead. Need to add a category for "seedling died" Unsure if 0 canker or 100% canker. Could have been killed by canker or the fungus is from after it died."
dead_individuals <- c(2, 38, 50)
health_assess_2025 <- health_assess_2025 %>%
  filter(!plant_number %in% dead_individuals)
Removes additional text description
# Collapse each full-sentence severity answer from the form to its leading
# category number, kept as a string ("1" to "5").
health_assess_2025 <- mutate(
  health_assess_2025,
  purdue_severity_canker = recode(
    purdue_severity_canker,
    "1. Fewer than 3 active cankers that are all smaller than 2-3 inches in length or diameter OR fewer than 3 inactive cankers." = "1",
    "2. More than 3 active cankers, OR 2-5 shallow (with no dead tissue) healed over with cracks less than 7 inches long." = "2",
    "3. More than 5 active OR inactive cankers cracked through the bark to the tissue below which have healed over, but you still see the level of damage." = "3",
    "4. Cankers occur all over the 10-foot area, with deep cracks and both active and inactive cankers." = "4",
    "5. Tree almost dead, mostly inactive cankers with deep cracks to dead tissue." = "5"
  )
)
In the later versions of the survey, the epicormic question changed from a count of epicormics to a Y or N. Here, I make that adjustment.
# Turn the epicormic counts into a Y/N flag: a missing or zero count becomes
# "N", anything else becomes "Y". The conversion is done on a copy, displayed
# beside the original counts, then written back.
epicormic_cleaning <- health_assess_2025 %>%
  mutate(
    base_epicormics = if_else(
      is.na(base_epicormics) | base_epicormics == 0, "N", "Y"
    ),
    trunk_epicormics = if_else(
      is.na(trunk_epicormics) | trunk_epicormics == 0, "N", "Y"
    )
  )
epicormic_comparison <- tibble(
  individual = health_assess_2025[["plant_number"]],
  original_base_ep = health_assess_2025[["base_epicormics"]],
  clean_base_ep = epicormic_cleaning[["base_epicormics"]],
  original_trunk_ep = health_assess_2025[["trunk_epicormics"]],
  clean_trunk_ep = epicormic_cleaning[["trunk_epicormics"]]
)
epicormic_comparison %>%
  datatable(
    class = "stripe hover row-border order-column", # forces light theme
    options = list(
      pageLength = 10,
      scrollY = "400px",
      scrollX = TRUE
    )
  )
health_assess_2025 <- health_assess_2025 %>%
  mutate(
    base_epicormics = epicormic_cleaning$base_epicormics,
    trunk_epicormics = epicormic_cleaning$trunk_epicormics
  )
Later in the form we decided to stop trying to assess callouses on seedlings, so their callous information needs to be updated.
# Callous answers are blanked for seedlings. The seedling-only subset is used
# to preview the change; the same rule is then applied to the full data set.
seedlings_assess_2025 <- filter(health_assess_2025, seedling_y_n == "Yes")
seedling_clean <- mutate(
  seedlings_assess_2025,
  has_callous = if_else(seedling_y_n == "Yes", NA, has_callous)
)
seedling_comparison <- tibble(
  individual = seedlings_assess_2025[["plant_number"]],
  original_callous = seedlings_assess_2025[["has_callous"]],
  clean_callous = seedling_clean[["has_callous"]]
)
seedling_comparison %>%
  datatable(
    class = "stripe hover row-border order-column", # forces light theme
    options = list(
      pageLength = 10,
      scrollY = "400px",
      scrollX = TRUE
    )
  )
health_assess_2025 <- mutate(
  health_assess_2025,
  has_callous = if_else(seedling_y_n == "Yes", NA, has_callous)
)
Space to make case-by-case adjustments based on the individual-focused photo reviews.
# SH37
# - this tree had 3 separate entries describing the information about the same individual, so they can be aggregated into one entry manually
# TODO: the aggregation described above is not implemented in this section.
# Final look at the fully cleaned data, then export it.
health_assess_2025 %>%
  datatable(
    class = "stripe hover row-border order-column", # forces light theme
    options = list(
      pageLength = 10,
      scrollY = "400px",
      scrollX = TRUE
    )
  )
write.csv(health_assess_2025, "may_june_cleaned_2025.csv", row.names = FALSE)