Tidying data

Import

library(tidyverse)
library(readr)
library(styler)
# Read the 2024 health assessment form export (path is relative to the
# working directory) into a tibble.
health_assess_2024 <-
  "july_3_2024_butternut_health_assessment_forms.csv" %>%
  read_csv()
# view(health_assess_2024)

Select relevant WCP entries

# Get only the WCP observations (first 66). NOTE: the filter below is currently commented out, so rows from every site are kept.
#health_assess_2024 <-  filter(health_assess_2024, health_assess_2024$`Site number or initial` == "WCP")

Renaming columns & creating variables for readability

2024

# Swap the verbose form-question headers for short snake_case names in one
# rename() call (new_name = `old name`).
health_assess_2024 <- health_assess_2024 %>%
  rename(
    plant_height_ft = `Plant height (ft)`,
    dbh_cm = `DBH (cm)`,
    percent_live_canopy = `Percent live canopy`,
    base_epicormics = `Number of epicormic branches/ sprouts from the base`,
    trunk_epicormics = `Number of epicormic branches/ sprouts from the trunk`,
    has_canker = `Visible cankers? (according to health assessment form)`,
    has_callous = `If large cankers present, are they being calloused over?`,
    trunk_canker_area = `Area of trunk infected by canker (%)`,
    base_canker_area = `Area of base/ root flare infected by canker (%)`,
    purdue_severity = `If trees are infected, severity of infection`,
    seedling_y_n = `Seedling (Y/N)`
  )

# Copy every renamed column into a standalone `<name>_2024` vector; the later
# chunks work on these vectors rather than on the data frame columns.
plant_height_ft_2024 <- health_assess_2024[["plant_height_ft"]]
dbh_cm_2024 <- health_assess_2024[["dbh_cm"]]
percent_live_canopy_2024 <- health_assess_2024[["percent_live_canopy"]]
base_epicormics_2024 <- health_assess_2024[["base_epicormics"]]
trunk_epicormics_2024 <- health_assess_2024[["trunk_epicormics"]]
has_canker_2024 <- health_assess_2024[["has_canker"]]
has_callous_2024 <- health_assess_2024[["has_callous"]]
trunk_canker_area_2024 <- health_assess_2024[["trunk_canker_area"]]
base_canker_area_2024 <- health_assess_2024[["base_canker_area"]]
purdue_severity_2024 <- health_assess_2024[["purdue_severity"]]
seedling_y_n_2024 <- health_assess_2024[["seedling_y_n"]]

Fixing typing

2024

# Coerce the measurement vectors to numeric. Entries that cannot be parsed as
# numbers become NA, which is what the recorded warnings below report. Only the
# standalone vectors are converted; the data frame columns are left untouched.
plant_height_ft_2024 <- plant_height_ft_2024 %>% as.numeric()

## Warning: NAs introduced by coercion

dbh_cm_2024 <- dbh_cm_2024 %>% as.numeric()

## Warning: NAs introduced by coercion

percent_live_canopy_2024 <- percent_live_canopy_2024 %>% as.numeric()

## Warning: NAs introduced by coercion

base_canker_area_2024 <- base_canker_area_2024 %>% as.numeric()

## Warning: NAs introduced by coercion

trunk_canker_area_2024 <- trunk_canker_area_2024 %>% as.numeric()

## Warning: NAs introduced by coercion

Data exploration

Summary of data

# Print a labelled summary() for each 2024 vector. Each print() emits a
# separator naming the vector; the `##` lines are the console output recorded
# when this document was rendered.
print("plant_height_ft_2024------------------------------")

## [1] "plant_height_ft_2024------------------------------"

summary(plant_height_ft_2024)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00    2.50    7.00   14.81   24.00   67.00      33

print("dbh_cm_2024------------------------------")

## [1] "dbh_cm_2024------------------------------"

summary(dbh_cm_2024)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.635   1.979   5.080  11.462  17.775  63.900      76

print("percent_live_canopy_2024------------------------------")

## [1] "percent_live_canopy_2024------------------------------"

summary(percent_live_canopy_2024)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   90.00  100.00   87.49  100.00  100.00      56

print("base_epicormics_2024------------------------------")

## [1] "base_epicormics_2024------------------------------"

summary(base_epicormics_2024)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.6322  0.0000 14.0000      47

print("trunk_epicormics_2024------------------------------")

## [1] "trunk_epicormics_2024------------------------------"

summary(trunk_epicormics_2024)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.3765  0.0000  3.0000      49

print("has_canker_2024------------------------------")

## [1] "has_canker_2024------------------------------"

# has_canker_2024 is a character vector, so summary() reports only its
# length, class and mode (no per-value counts).
summary(has_canker_2024)

##    Length     Class      Mode 
##       134 character character

print("has_callous_2024------------------------------")

## [1] "has_callous_2024------------------------------"

# Character vector: length/class/mode only.
summary(has_callous_2024)

##    Length     Class      Mode 
##       134 character character

print("trunk_canker_area_2024------------------------------")

## [1] "trunk_canker_area_2024------------------------------"

summary(trunk_canker_area_2024)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00   20.00   21.48   30.00   90.00      38

print("base_canker_area_2024------------------------------")

## [1] "base_canker_area_2024------------------------------"

summary(base_canker_area_2024)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.00   14.81   15.00   90.00      57

print("purdue_severity_2024------------------------------")

## [1] "purdue_severity_2024------------------------------"

# Character vector: length/class/mode only.
summary(purdue_severity_2024)

##    Length     Class      Mode 
##       134 character character

Proportion of seedlings

# Bar chart of row counts per seedling_y_n_2024 value, with each bar filled by
# that same value.
bar_seedlings <- ggplot(
  data = health_assess_2024,
  mapping = aes(x = seedling_y_n_2024)
) +
  geom_bar(mapping = aes(fill = seedling_y_n_2024))

bar_seedlings

Plant Height & DBH

library(patchwork)

# Plant Height (ft)
# Histogram of plant_height_ft_2024 (30 bins, filled by seedling_y_n_2024)
# with reference lines at the mean (dashed) and the median (solid).
mean_plant_height <- mean(plant_height_ft_2024, na.rm = TRUE)
median_plant_height <- median(plant_height_ft_2024, na.rm = TRUE)
print(mean_plant_height)

## [1] 14.81188

hist_plant_height <- health_assess_2024 %>%
  ggplot(aes(x = plant_height_ft_2024)) +
  geom_histogram(bins = 30, aes(fill = seedling_y_n_2024)) +
  ylab("Number of Individuals") +
  # Mean. The intercept is a constant, so it is passed as a parameter (one
  # line) instead of through aes() (one overlapping line per data row).
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_vline(
    xintercept = mean_plant_height,
    color = "#bdbdbd",
    linetype = "dashed",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#bdbdbd",
    x = 17,
    y = 15,
    label = paste("Mean =", round(mean_plant_height, 2), "ft")
  ) +
  # Median
  geom_vline(
    xintercept = median_plant_height,
    color = "#636363",
    linetype = "solid",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#636363",
    x = 11,
    y = 20,
    label = paste("Median =", round(median_plant_height, 2), "ft")
  )

hist_plant_height

## Warning: Removed 33 rows containing non-finite outside the scale range
## (`stat_bin()`).

# DBH (cm)
# Histogram of dbh_cm_2024 (30 bins, filled by seedling_y_n_2024) with
# reference lines at the mean (dashed) and the median (solid).
mean_dbh <- mean(dbh_cm_2024, na.rm = TRUE)
median_dbh <- median(dbh_cm_2024, na.rm = TRUE)

hist_dbh <- health_assess_2024 %>%
  ggplot(aes(x = dbh_cm_2024)) +
  geom_histogram(bins = 30, aes(fill = seedling_y_n_2024)) +
  ylab("Number of Individuals") +
  # Mean. The intercept is a constant, so it is passed as a parameter (one
  # line) instead of through aes() (one overlapping line per data row).
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_vline(
    xintercept = mean_dbh,
    color = "#bdbdbd",
    linetype = "dashed",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#bdbdbd",
    x = 15,
    y = 5,
    label = paste("Mean =", round(mean_dbh, 2), "cm")
  ) +
  # Median
  geom_vline(
    xintercept = median_dbh,
    color = "#636363",
    linetype = "solid",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#636363",
    x = 11,
    y = 8,
    label = paste("Median =", round(median_dbh, 2), "cm")
  )
hist_dbh

## Warning: Removed 76 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Stack the height histogram above the DBH histogram (patchwork `/`).
hist_plant_height / hist_dbh

## Warning: Removed 33 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 76 rows containing non-finite outside the scale range
## (`stat_bin()`).

% Live Canopy

# Histogram of percent_live_canopy_2024 (30 bins, filled by
# seedling_y_n_2024) with reference lines at the mean (dashed) and the
# median (solid).
mean_percent_live_canopy <- mean(percent_live_canopy_2024, na.rm = TRUE)
median_percent_live_canopy <- median(percent_live_canopy_2024, na.rm = TRUE)

hist_percent_live_canopy_2024 <- health_assess_2024 %>%
  ggplot(aes(x = percent_live_canopy_2024)) +
  xlab("% Live Canopy in 2024") +
  ylab("Number of Individuals") +
  geom_histogram(bins = 30, aes(fill = seedling_y_n_2024)) +
  # Mean. The intercept is a constant, so it is passed as a parameter (one
  # line) instead of through aes() (one overlapping line per data row).
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_vline(
    xintercept = mean_percent_live_canopy,
    color = "#bdbdbd",
    linetype = "dashed",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#bdbdbd",
    x = 70,
    y = 20,
    label = paste("Mean =", round(mean_percent_live_canopy, 2), "%")
  ) +
  # Median
  geom_vline(
    xintercept = median_percent_live_canopy,
    color = "#636363",
    linetype = "solid",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#636363",
    x = 85,
    y = 30,
    label = paste("Median =", round(median_percent_live_canopy, 2), "%")
  )

hist_percent_live_canopy_2024

## Warning: Removed 56 rows containing non-finite outside the scale range
## (`stat_bin()`).

Epicormics

# ------------ BASE
# Histogram of base_epicormics_2024 (30 bins, filled by seedling_y_n_2024)
# with reference lines at the mean (dashed) and the median (solid).

mean_hist_epicormic_base <- mean(base_epicormics_2024, na.rm = TRUE)
median_hist_epicormic_base <- median(base_epicormics_2024, na.rm = TRUE)

hist_epicormic_base <- health_assess_2024 %>%
  ggplot(aes(x = base_epicormics_2024)) +
  # One histogram layer only: the earlier version added the same layer twice,
  # drawing identical bars on top of each other and doubling the warnings.
  geom_histogram(bins = 30, aes(fill = seedling_y_n_2024)) +
  ylab("Number of Individuals") +
  # Mean. The intercept is a constant, so it is passed as a parameter (one
  # line) instead of through aes() (one overlapping line per data row).
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_vline(
    xintercept = mean_hist_epicormic_base,
    color = "#bdbdbd",
    linetype = "dashed",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#bdbdbd",
    x = 1.3,
    y = 20,
    label = paste("Mean =", round(mean_hist_epicormic_base, 2))
  ) +
  # Median
  geom_vline(
    xintercept = median_hist_epicormic_base,
    color = "#636363",
    linetype = "solid",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#636363",
    x = 0.7,
    y = 30,
    label = paste("Median =", round(median_hist_epicormic_base, 2))
  )

# ------------ TRUNK
# Same plot for trunk_epicormics_2024. The statistics get their own names so
# that the base functions mean() and median() are not shadowed by numbers.
mean_hist_epicormic_trunk <- mean(trunk_epicormics_2024, na.rm = TRUE)
median_hist_epicormic_trunk <- median(trunk_epicormics_2024, na.rm = TRUE)

hist_epicormic_trunk <- health_assess_2024 %>%
  ggplot(aes(x = trunk_epicormics_2024)) +
  # bins = 30 is the value stat_bin() was already defaulting to; stating it
  # silences the "Pick better value" message.
  geom_histogram(bins = 30, aes(fill = seedling_y_n_2024)) +
  ylab("Number of Individuals") +
  # Mean
  geom_vline(
    xintercept = mean_hist_epicormic_trunk,
    color = "#bdbdbd",
    linetype = "dashed",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#bdbdbd",
    x = 0.6,
    y = 20,
    label = paste("Mean =", round(mean_hist_epicormic_trunk, 2))
  ) +
  # Median
  geom_vline(
    xintercept = median_hist_epicormic_trunk,
    color = "#636363",
    linetype = "solid",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#636363",
    x = 0.4,
    y = 30,
    label = paste("Median =", round(median_hist_epicormic_trunk, 2))
  )

# Stack the base histogram above the trunk histogram (patchwork `/`).
hist_epicormic_base / hist_epicormic_trunk

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 47 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 47 rows containing non-finite outside the scale range (`stat_bin()`).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 49 rows containing non-finite outside the scale range
## (`stat_bin()`).

Has Canker & Callous

library(viridis)

## Loading required package: viridisLite

# Row counts per has_canker value (read from the data frame column).
bar_has_canker_2024 <- ggplot(
  data = health_assess_2024,
  mapping = aes(x = has_canker)
) +
  geom_bar() # aes(fill = has_callous_2024))

# Row counts per has_callous_2024 value (read from the standalone vector).
bar_has_callous_2024 <- ggplot(
  data = health_assess_2024,
  mapping = aes(x = has_callous_2024)
) +
  geom_bar() # aes(fill = purdue_severity_2024))

# Place the two bar charts side by side (patchwork `+`).
bar_has_canker_2024 + bar_has_callous_2024

Canker Areas

# Histogram of base_canker_area_2024 with reference lines at the mean
# (dashed) and the median (solid).
mean_base_area <- mean(base_canker_area_2024, na.rm = TRUE)
median_base_area <- median(base_canker_area_2024, na.rm = TRUE)

hist_base_canker <- health_assess_2024 %>%
  ggplot(aes(x = base_canker_area_2024)) +
  # bins = 30 is the value stat_bin() was already defaulting to; stating it
  # silences the "Pick better value" message.
  geom_histogram(bins = 30) + # aes(fill = has_callous_2024))
  ylab("Number of Individuals") +
  # Mean. The intercept is a constant, so it is passed as a parameter (one
  # line) instead of through aes() (one overlapping line per data row).
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_vline(
    xintercept = mean_base_area,
    color = "#bdbdbd",
    linetype = "dashed",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#bdbdbd",
    x = 26,
    y = 10,
    label = paste("Mean =", round(mean_base_area, 2), "%")
  ) +
  # Median
  geom_vline(
    xintercept = median_base_area,
    color = "#636363",
    linetype = "solid",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#636363",
    x = 10,
    y = 25,
    label = paste("Median =", round(median_base_area, 2), "%")
  )

# Same plot for trunk_canker_area_2024.
mean_trunk_area <- mean(trunk_canker_area_2024, na.rm = TRUE)
median_trunk_area <- median(trunk_canker_area_2024, na.rm = TRUE)

hist_trunk_canker <- health_assess_2024 %>%
  ggplot(aes(x = trunk_canker_area_2024)) +
  geom_histogram(bins = 30) + # aes(fill = has_callous_2024))
  ylab("Number of Individuals") +
  # Mean
  geom_vline(
    xintercept = mean_trunk_area,
    color = "#bdbdbd",
    linetype = "dashed",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#bdbdbd",
    x = 32,
    y = 10,
    label = paste("Mean =", round(mean_trunk_area, 2), "%")
  ) +
  # Median
  geom_vline(
    xintercept = median_trunk_area,
    color = "#636363",
    linetype = "solid",
    linewidth = 1
  ) +
  annotate(
    "text",
    color = "#636363",
    x = 30,
    y = 13,
    label = paste("Median =", round(median_trunk_area, 2), "%")
  )

# Stack the base histogram above the trunk histogram (patchwork `/`).
hist_base_canker / hist_trunk_canker

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 57 rows containing non-finite outside the scale range
## (`stat_bin()`).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 38 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Kernel density estimate of trunk_canker_area_2024. The mean/median reference
# layers kept below are commented out and therefore not drawn.
density_trunk_canker <- ggplot(
  data = health_assess_2024,
  mapping = aes(x = trunk_canker_area_2024)
) +
  geom_density()
  # + aes(fill = has_callous_2024))
  # # Mean
  # geom_vline(
  #   aes(xintercept = mean_trunk_area),
  #   color = "#bdbdbd",
  #   linetype = "dashed",
  #   size = 1
  # ) +
  # annotate("text",
  #          color = "#bdbdbd",
  #          x = 0.1,
  #          y = 0.1,
  #          label = paste("Mean =", round(mean_trunk_area, 2), "%")) +
  #
  # # Median
  # geom_vline(
  #   aes(xintercept = median_trunk_area),
  #   color = "#636363",
  #   linetype = "solid",
  #   size = 1
  # ) +
  # annotate("text",
  #          color = "#636363",
  #          x = 0.5,
  #          y = 0.5,
  #          label = paste("Median =", round(median_trunk_area, 2), "%"))


density_trunk_canker

## Warning: Removed 38 rows containing non-finite outside the scale range
## (`stat_density()`).

# Build the "y = <slope>x + <intercept>" label from lm() coefficients
# (intercept first, slope second), rounded to 2 decimals. A negative intercept
# is written as "- b" rather than "+ -b".
format_lm_equation <- function(coefficients) {
  est <- unname(round(coefficients, 2))
  sign_text <- if (est[1] < 0) " - " else " + "
  paste0("y = ", est[2], "x", sign_text, abs(est[1]))
}

# Linear regression model: base canker area as a function of DBH.
# The formula variables are the standalone numeric vectors; they are not
# columns of `health_assess_2024`, so lm() resolves them from the environment.
model <- lm(base_canker_area_2024 ~ dbh_cm_2024, data = health_assess_2024)
coefs <- coef(model)
r_squared <- summary(model)$r.squared
equation <- format_lm_equation(coefs)

point_base_canker <- health_assess_2024 %>%
  ggplot(aes(x = dbh_cm_2024, y = base_canker_area_2024)) +
  geom_point(aes(color = purdue_severity_2024)) +
  # The explicit formula is the one geom_smooth() was already defaulting to.
  geom_smooth(method = "lm", formula = y ~ x) +
  annotate(
    "text",
    x = 30,
    y = 80,
    label = equation,
    hjust = 0
  ) +
  annotate(
    "text",
    x = 30,
    y = 65,
    # paste() already inserts the separating space, so the literal has none.
    label = paste("R-squared =", round(r_squared, 4)),
    hjust = 0
  )

# Linear regression model: trunk canker area as a function of DBH.
# `model`, `coefs`, `r_squared` and `equation` are deliberately reassigned;
# the labels above were already evaluated when annotate() was called.
model <- lm(trunk_canker_area_2024 ~ dbh_cm_2024, data = health_assess_2024)
coefs <- coef(model)
r_squared <- summary(model)$r.squared
equation <- format_lm_equation(coefs)

point_trunk_canker <- health_assess_2024 %>%
  ggplot(aes(x = dbh_cm_2024, y = trunk_canker_area_2024)) +
  geom_point(aes(color = purdue_severity_2024)) +
  geom_smooth(method = "lm", formula = y ~ x) +
  annotate(
    "text",
    x = 30,
    y = 80,
    label = equation,
    hjust = 0
  ) +
  annotate(
    "text",
    x = 30,
    y = 65,
    label = paste("R-squared =", round(r_squared, 4)),
    hjust = 0
  )

# Stack the two scatter plots and merge their legends into one.
(point_base_canker / point_trunk_canker) + plot_layout(guides = "collect")

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 89 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 89 rows containing missing values or values outside the scale range
## (`geom_point()`).

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 79 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 79 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Scatter of base_canker_area_2024 against dbh_cm_2024, coloured by
# purdue_severity_2024 and split into one panel per purdue_severity value.
ggplot(data = health_assess_2024) +
  geom_point(
    mapping = aes(
      x = dbh_cm_2024,
      y = base_canker_area_2024,
      colour = purdue_severity_2024
    )
  ) +
  theme_classic() +
  facet_wrap(vars(purdue_severity))

## Warning: Removed 89 rows containing missing values or values outside the scale range
## (`geom_point()`).

Canker severity

# Bar chart of row counts per purdue_severity_2024 value, with each bar filled
# by that same value.
bar_canker_severity <- ggplot(
  data = health_assess_2024,
  mapping = aes(x = purdue_severity_2024)
) +
  geom_bar(mapping = aes(fill = purdue_severity_2024))

bar_canker_severity

week6_2024_data_exploration

2025-07-03

Tidying data

Import

Select relevant WCP entries

Renaming columns & creating variables for readability

2024

Fixing typing

2024

Data exploration

Summary of data

Proportion of seedlings

Plant Height & DBH

% Live Canopy

Epicormics

Has Canker & Callous

Canker Areas

Canker severity

Summary Plots