Data arrangement and cleaning

Clean data: rename variables, find/correct/remove missing values, and identify outliers

# Give the raw columns descriptive names, then derive the per-plant mean
# diameter and fill in missing stem counts.
cleaned_data <- plant_data %>%
  rename(
    ID = plant_ID,
    Height_cm = height,
    Diameter_cm = diameter,
    Diameter2_cm = diameter2,
    Stem_Count = stems,
    Resprout_Count = resprouts,
    Resprout_Length_cm = resprout_length,
    Gall_index = gall
  ) %>%
  mutate(
    # A missing stem count is replaced with zero.
    Stem_Count = replace_na(Stem_Count, 0),
    # Row-wise mean of the two diameter measurements. A single missing
    # measurement is ignored; the result is NaN only when both are missing.
    Diameter_avg = rowMeans(cbind(Diameter_cm, Diameter2_cm), na.rm = TRUE)
  )

#' Identify rows whose value in one column lies outside the IQR fences
#'
#' Computes the first and third quartiles of `data[[column]]` (ignoring
#' missing values) and keeps the rows whose value is below
#' `Q1 - k * IQR` or above `Q3 + k * IQR`.
#'
#' @param data A data frame or tibble.
#' @param column Name of the column to screen, as a single string.
#' @param k Non-negative multiplier applied to the IQR when building the
#'   fences. Defaults to 1.5.
#' @return The rows of `data` flagged as outliers, with all columns kept.
#'   Rows whose value in `column` is `NA` are never returned.
identify_outliers <- function(data, column, k = 1.5) {
  if (!is.character(column) || length(column) != 1L || is.na(column)) {
    stop("`column` must be a single column name given as a string.",
         call. = FALSE)
  }
  if (!column %in% names(data)) {
    stop("Column `", column, "` was not found in `data`.", call. = FALSE)
  }
  if (!is.numeric(k) || length(k) != 1L || is.na(k) || k < 0) {
    stop("`k` must be a single non-negative number.", call. = FALSE)
  }

  values <- data[[column]]
  # Text and factor columns cannot be ranked by quantile(); fail here with a
  # readable message instead of a cryptic arithmetic error further down.
  if (is.character(values) || is.factor(values)) {
    stop("Column `", column, "` must be numeric to compute IQR fences.",
         call. = FALSE)
  }

  # names = FALSE drops the "25%" / "75%" labels so the bounds are plain
  # scalars.
  quartiles <- quantile(
    values,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  fence_width <- k * (quartiles[2] - quartiles[1])
  lower_bound <- quartiles[1] - fence_width
  upper_bound <- quartiles[2] + fence_width

  # .data[[column]] looks the column up by its string name inside filter().
  data %>%
    filter(.data[[column]] < lower_bound | .data[[column]] > upper_bound)
}


# Rows flagged as outliers on plant height.
height_outliers <- identify_outliers(
  data = cleaned_data,
  column = "Height_cm"
)
print(height_outliers)
## # A tibble: 0 × 9
## # ℹ 9 variables: ID <dbl>, Height_cm <dbl>, Diameter_cm <dbl>,
## #   Diameter2_cm <dbl>, Stem_Count <dbl>, Resprout_Count <chr>,
## #   Resprout_Length_cm <chr>, Gall_index <dbl>, Diameter_avg <dbl>
# Rows flagged as outliers on the first diameter measurement.
D1_outliers <- identify_outliers(
  data = cleaned_data,
  column = "Diameter_cm"
)
print(D1_outliers)
## # A tibble: 0 × 9
## # ℹ 9 variables: ID <dbl>, Height_cm <dbl>, Diameter_cm <dbl>,
## #   Diameter2_cm <dbl>, Stem_Count <dbl>, Resprout_Count <chr>,
## #   Resprout_Length_cm <chr>, Gall_index <dbl>, Diameter_avg <dbl>
# Rows flagged as outliers on the second diameter measurement.
D2_outliers <- identify_outliers(
  data = cleaned_data,
  column = "Diameter2_cm"
)
print(D2_outliers)
## # A tibble: 0 × 9
## # ℹ 9 variables: ID <dbl>, Height_cm <dbl>, Diameter_cm <dbl>,
## #   Diameter2_cm <dbl>, Stem_Count <dbl>, Resprout_Count <chr>,
## #   Resprout_Length_cm <chr>, Gall_index <dbl>, Diameter_avg <dbl>
# Rows flagged as outliers on stem count (missing counts were set to 0
# during cleaning).
stem_outliers <- identify_outliers(
  data = cleaned_data,
  column = "Stem_Count"
)
print(stem_outliers)
## # A tibble: 5 × 9
##      ID Height_cm Diameter_cm Diameter2_cm Stem_Count Resprout_Count
##   <dbl>     <dbl>       <dbl>        <dbl>      <dbl> <chr>         
## 1    14        54          58           38         18 >10           
## 2    19        53          74           47         18 >10           
## 3    27        50          42           32         23 >10           
## 4    29        58          40           30         13 >10           
## 5    37        57          91           81         14 >10           
## # ℹ 3 more variables: Resprout_Length_cm <chr>, Gall_index <dbl>,
## #   Diameter_avg <dbl>
# Rows flagged as outliers on the gall index.
# NOTE(review): the printed result lists IDs 12 and 38 twice each -- check
# plant_data for duplicated rows.
Gall_outliers <- identify_outliers(
  data = cleaned_data,
  column = "Gall_index"
)
print(Gall_outliers)
## # A tibble: 10 × 9
##       ID Height_cm Diameter_cm Diameter2_cm Stem_Count Resprout_Count
##    <dbl>     <dbl>       <dbl>        <dbl>      <dbl> <chr>         
##  1     4       124         181          112          5 0             
##  2    12       130         134          108          3 1             
##  3    12       130         134          108          3 1             
##  4    19        53          74           47         18 >10           
##  5    24        71         129           70          2 0             
##  6   215        63         117           55          3 0             
##  7    37        57          91           81         14 >10           
##  8    38        36          27           23          8 8             
##  9    38        36          27           23          8 8             
## 10    40        70          81           74          4 0             
## # ℹ 3 more variables: Resprout_Length_cm <chr>, Gall_index <dbl>,
## #   Diameter_avg <dbl>

Summary statistics on clean data

## # A tibble: 1 × 4
##   Avg_Height Avg_Diameter Avg_Stem_Count Avg_Gall_Index
##        <dbl>        <dbl>          <dbl>          <dbl>
## 1       44.1         47.7            3.8          0.125

Data distributions (negative skew due to not dropping “zero” entries)