Below is a summary of the pretreatment data taken from the TridentataMasterData Excel file, updated for 2024.

##       plot          plantID          height          diameter     
##  Min.   :801.0   Min.   : 17.0   Min.   : 15.00   Min.   : 16.00  
##  1st Qu.:809.0   1st Qu.:144.5   1st Qu.: 60.50   1st Qu.: 73.08  
##  Median :815.0   Median :562.0   Median : 74.00   Median : 98.00  
##  Mean   :814.6   Mean   :426.5   Mean   : 73.45   Mean   :102.78  
##  3rd Qu.:821.0   3rd Qu.:690.5   3rd Qu.: 86.00   3rd Qu.:132.00  
##  Max.   :827.0   Max.   :900.0   Max.   :124.00   Max.   :250.00  
##                  NA's   :5       NA's   :5        NA's   :5       
##     diamter2         stem#         gallindex     
##  Min.   :  9.0   Min.   : 1.00   Min.   :0.0000  
##  1st Qu.: 50.0   1st Qu.: 2.00   1st Qu.:0.0000  
##  Median : 72.0   Median : 3.00   Median :0.0000  
##  Mean   : 75.9   Mean   : 3.36   Mean   :0.0404  
##  3rd Qu.: 95.5   3rd Qu.: 4.00   3rd Qu.:0.0000  
##  Max.   :229.0   Max.   :13.00   Max.   :1.0000  
##  NA's   :5       NA's   :5       NA's   :5

Cleaning up data

# Per-plant measurements pulled from the master sheet under tidier names.
# The source sheet spells the second diameter column "diamter2" and the
# stem-count column "stem#", so both are looked up by their exact names.
plant_data <- tibble(
  plant_ID  = c(TridentataMasterData[["plantID"]]),
  height    = c(TridentataMasterData[["height"]]),
  diameter  = c(TridentataMasterData[["diameter"]]),
  diameter2 = c(TridentataMasterData[["diamter2"]]),
  stem_num  = c(TridentataMasterData[["stem#"]])
)
# Clipped-biomass records: one row per plot/corner with its gross mass.
clip_data <- tibble(
  plot       = c(Clipdata[["Plot"]]),
  corner     = c(Clipdata[["Letter"]]),
  mass_grams = c(Clipdata[["GrossMass"]])
)

Clean data: rename variables; find, correct, or remove missing values; and identify outliers.

# Give every measurement a unit-bearing name, then derive the per-plant mean
# of the two diameter readings and fill in missing stem counts.
cleaned_data <- plant_data %>%
  rename(
    ID           = plant_ID,
    Height_cm    = height,
    Diameter_cm  = diameter,
    Diameter2_cm = diameter2,
    Stem_Count   = stem_num
  ) %>%
  mutate(
    # Row-wise mean of the two diameters; with na.rm = TRUE a row that has
    # only one reading keeps that reading as its average.
    # NOTE(review): rows where both diameters are missing come out as NaN
    # rather than NA -- confirm that is acceptable downstream.
    Diameter_avg = rowMeans(cbind(Diameter_cm, Diameter2_cm), na.rm = TRUE),
    # NOTE(review): a missing stem count is recorded as 0 stems -- confirm
    # this is intended for records whose other measurements are also missing.
    Stem_Count = replace_na(Stem_Count, 0)
  )

#' Identify outliers in one column using the 1.5 * IQR rule
#'
#' Computes the first and third quartiles of `data[[column]]` (missing values
#' ignored) and returns the rows whose value lies below `Q1 - 1.5 * IQR` or
#' above `Q3 + 1.5 * IQR`.
#'
#' @param data A data frame or tibble.
#' @param column Name of the column to screen, given as a single string.
#' @return The rows of `data` flagged as outliers, with every column kept and
#'   rows in their original order. Rows where `column` is `NA` are never
#'   returned. A zero-row result means no outliers were found.
identify_outliers <- function(data, column) {
  if (!is.character(column) || length(column) != 1L || is.na(column)) {
    stop("`column` must be a single column name given as a string.",
         call. = FALSE)
  }
  if (!column %in% names(data)) {
    stop("Column `", column, "` not found in `data`.", call. = FALSE)
  }

  values <- data[[column]]
  quartiles <- quantile(
    values,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  fence <- 1.5 * (quartiles[2] - quartiles[1])
  lower_bound <- quartiles[1] - fence
  upper_bound <- quartiles[2] + fence

  # Compare the extracted vector directly instead of inside a data-masked
  # filter(): that way a column in `data` that happens to be called
  # `lower_bound` or `upper_bound` cannot shadow the fences computed here.
  is_outlier <- values < lower_bound | values > upper_bound

  # which() discards NA comparisons, so rows with a missing value are dropped.
  data[which(is_outlier), , drop = FALSE]
}


# Plants whose height falls outside the 1.5 * IQR fences.
height_outliers <- identify_outliers(
  data = cleaned_data,
  column = "Height_cm"
)
print(height_outliers)
## # A tibble: 1 × 6
##      ID Height_cm Diameter_cm Diameter2_cm Stem_Count Diameter_avg
##   <dbl>     <dbl>       <dbl>        <dbl>      <dbl>        <dbl>
## 1   560        15          16            9          1         12.5
# Clip samples whose gross mass falls outside the 1.5 * IQR fences.
mass_outliers <- identify_outliers(
  data = clip_data,
  column = "mass_grams"
)
print(mass_outliers)
## # A tibble: 2 × 3
##    plot corner mass_grams
##   <dbl> <chr>       <dbl>
## 1   806 A            42.1
## 2   813 A            52.6

Summary statistics on clean data